4187c1c7IWmBinGdI19kL4MuZ6RLbQ docs/check_pkgs
3f9e7d60PWZJeVh5xdnk0nLUdxlqEA docs/figs/xenlogo.eps
418a3248xjIqmNKo0v_XQSfAvlBGFw docs/html.sty
+41c0c4116itF389v0CEWcmzue6zJkA docs/misc/VMX_changes.txt
4022a73cgxX1ryj1HgS-IwwB6NUi2A docs/misc/XenDebugger-HOWTO
412f4bd9sm5mCQ8BkrgKcAKZGadq7Q docs/misc/blkif-drivers-explained.txt
40d6ccbfKKBq8jE0ula4eHEzBiQuDA docs/misc/xen_config.html
3ddb79bc-Udq7ol-NX4q9XsYnN7A2Q xen/arch/x86/time.c
3ddb79bccYVzXZJyVaxuv5T42Z1Fsw xen/arch/x86/trampoline.S
3ddb79bcOftONV9h4QCxXOfiT0h91w xen/arch/x86/traps.c
+41c0c411tD3C7TpfDMiFTf7BaNd_Dg xen/arch/x86/vmx.c
+41c0c411ODt8uEmV-yUxpQLpqimE5Q xen/arch/x86/vmx_io.c
+41c0c4128URE0dxcO15JME_MuKBPfg xen/arch/x86/vmx_vmcs.c
419cbedeQDg8IrO3izo3o5rQNlo0kQ xen/arch/x86/x86_32/asm-offsets.c
3e32af9aRnYGl4GMOaDKp7JdfhOGhg xen/arch/x86/x86_32/domain_page.c
3ddb79bcecupHj56ZbTa3B0FxDowMg xen/arch/x86/x86_32/entry.S
3ddb79c34BFiXjBJ_cCKB0aCsV1IDw xen/include/asm-x86/desc.h
40715b2dTokMLYGSuD58BnxOqyWVew xen/include/asm-x86/div64.h
3e20b82fl1jmQiKdLy7fxMcutfpjWA xen/include/asm-x86/domain_page.h
+41c0c412Ufq5sAvri3dMHC1BXiO6Gw xen/include/asm-x86/e820.h
3ddb79c3NU8Zy40OTrq3D-i30Y3t4A xen/include/asm-x86/fixmap.h
3e2d29944GI24gf7vOP_7x8EyuqxeA xen/include/asm-x86/flushtlb.h
3ddb79c39o75zPP0T1aQQ4mNrCAN2w xen/include/asm-x86/hardirq.h
3ddb79c3ezddh34MdelJpa5tNR00Dw xen/include/asm-x86/system.h
3ddb79c4HugMq7IYGxcQKFBpKwKhzA xen/include/asm-x86/types.h
40cf1596saFaHD5DC5zvrSn7CDCWGQ xen/include/asm-x86/uaccess.h
+41c0c412k6GHYF3cJtDdw37ee3TVaw xen/include/asm-x86/vmx.h
+41c0c412hck3QX-6_MaXaISGkngQuA xen/include/asm-x86/vmx_cpu.h
+41c0c41243jC1mcArZx_t3YkBL4lTA xen/include/asm-x86/vmx_platform.h
+41c0c412lQ0NVVN9PsOSznQ-qhOiPA xen/include/asm-x86/vmx_vmcs.h
418fbcfe_WliJPToeVM-9VStvym-hw xen/include/asm-x86/x86_32/asm_defns.h
3ddb79c2ADvRmdexd9y3AYK9_NTx-Q xen/include/asm-x86/x86_32/current.h
3ddb79c3mbqEM7QQr3zVq7NiBNhouA xen/include/asm-x86/x86_32/regs.h
4121d149udGfSUGhn3k1ECz0bM31nQ xen/include/public/grant_table.h
40f5623bqoi4GEoBiiUc6TZk1HjsMg xen/include/public/io/blkif.h
40dc4076pVeE1kEEWzcUaNZin65kCA xen/include/public/io/domain_controller.h
+41c0c412FLc0gunlJl91qMYscFtXVA xen/include/public/io/ioreq.h
40f5623cTZ80EwjWUBlh44A9F9i_Lg xen/include/public/io/netif.h
4051db79512nOCGweabrFWO2M2h5ng xen/include/public/physdev.h
40589968wmhPmV5-ENbBYmMjnedgKw xen/include/public/sched_ctl.h
--- /dev/null
+Changes to Xen in support of Intel(R) Vanderpool Technology
+-------------------------------------------------------------
+
+Our VT extensions to the Xen hypervisor provide full platform
+virtualization, including CPU(s), memory, and I/O infrastructure. The
+generic code in Xen handles and schedules those virtual machines as it
+does for the existing para-virtualized domains.
+
+Full virtualization of the guest OSes requires full device
+virtualization as well. The device models in BOCHS
+(http://bochs.sourceforge.net/) were decoupled from the CPU
+virtualization, and are used to virtualize the legacy devices (such as
+keyboard, mouse, VGA, IDE) in the PC platform. At this point, the
+device models run in user mode on domain 0, not in the Xen hypervisor.
+
+We would like to thank Ian Pratt and Keir Fraser for reviewing our
+design and code intensively, and for providing numerous useful
+suggestions to improve the architecture and code.
+
+We have a list of Intel team members who take credit for making this
+release happen: Yunhong Jiang, Nitin Kamble, Chengyuan Li, Xin Li,
+Xiaofeng Ling, Benjamin Liu, Asit Mallick, Jun Nakajima, Sunil Saxena,
+Arun Sharma, Edwin Zhai, Jeff Zheng, and Louis Zhuang. We'll continue
+to add more features to complete full virtualization in Xen using VT.
+
+These notes document the changes to the Xen hypervisor in order to add
+VT support. The changes to other areas, such as Control Panel will be
+added as we deliver the code.
+
+Summary of changes for the first release
+----------------------------------------
+December 15, 2004
+
+ * VT specific event handling and domain management were added.
+
+ * Shadow mode was extended to support full 32-bit guests
+
+ * Domain switching code was extended to support VT domain
+
+ * I/O request handling was added to communicate with the device model
+
+ * Domain builder was extended to provide the environment when the
+ guest enters the protected mode, including E820 memory and VGA
+ info, typically obtained by BIOS calls.
+
+New code:
+---------
+ VT (Vanderpool Technology) is based on the new VMX (Virtual
+ Machine Extensions) architecture. The current release of the
+ software supports 32-bit only.
+
+ * arch/x86/vmx.[ch] and arch/x86/vmx_*.[ch]: created to handle
+ VMX-specific events in order to provide a virtual machine.
+
+ * arch/x86/x86_32/entry.S: new code path was added to have the
+ first-level handler from VM exits. The first-level handler calls
+ the second-level handler in arch/x86/vmx.c.
+
+ * arch/x86/setup.c: new function start_vmx() to init_intel() to
+ enable VMX mode.
+
+ * include/asm-x86/config.h: #ifdef CONFIG_VMX was added.
+
+ * arch/x86/domain.c: new code path was added to create a VMX
+ domain given the flag from the control panel.
+
+ * include/public/io/ioreq.h: A new data structure was added to
+ define the I/O requests between the Xen hypervisor and the
+ device models.
+
+Changes to the existing code:
+-----------------------------
+
+ * arch/x86/shadow.[ch]: new mode SHM_full_32 was added to support
+ full virtualization. The current Xen code assumes that the guest
+ page directory and tables have _machine_ (or host) physical page
+ frame numbers, and the new code allows to support _guest_
+ physical page frame numbers
+
+ * include/asm-x86/processor.h: struct arch_vmx_struct arch_vmx has
+ been added to the thread_struct data structure. The arch_vmx has
+ the additional VMX-related CPU context.
+
+ * arch/x86/io_apic.c: reverse mapping between vector and irq has
+ been added. We will revisit this code when considering MSI
+ support.
+
+--- Jun
+
+
#include <asm/shadow.h>
#include <xen/console.h>
#include <xen/elf.h>
+#include <asm/vmx.h>
+#include <asm/vmx_vmcs.h>
+#include <xen/kernel.h>
+#include <public/io/ioreq.h>
#include <xen/multicall.h>
#if !defined(CONFIG_X86_64BITMODE)
smp_send_stop();
disable_IO_APIC();
#endif
+#ifdef CONFIG_VMX
+ stop_vmx();
+#endif
if(!reboot_thru_bios) {
/* rebooting needs to touch the page at absolute addr 0 */
}
}
+#ifdef CONFIG_VMX
+void arch_vmx_do_resume(struct exec_domain *d)
+{
+ vmx_do_resume(d);
+ reset_stack_and_jump(vmx_asm_do_resume);
+}
+
+void arch_vmx_do_launch(struct exec_domain *d)
+{
+ vmx_do_launch(d);
+ reset_stack_and_jump(vmx_asm_do_launch);
+}
+
+static void monitor_mk_pagetable(struct exec_domain *ed)
+{
+ unsigned long mpfn;
+ l2_pgentry_t *mpl2e;
+ struct pfn_info *mpfn_info;
+ struct mm_struct *m = &ed->mm;
+ struct domain *d = ed->domain;
+
+ mpfn_info = alloc_domheap_page(NULL);
+ ASSERT( mpfn_info );
+
+ mpfn = (unsigned long) (mpfn_info - frame_table);
+ mpl2e = (l2_pgentry_t *) map_domain_mem(mpfn << PAGE_SHIFT);
+ memset(mpl2e, 0, PAGE_SIZE);
+
+ memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+ &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+ HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+
+ m->monitor_table = mk_pagetable(mpfn << PAGE_SHIFT);
+ m->shadow_mode = SHM_full_32;
+
+ mpl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
+ mk_l2_pgentry((__pa(d->mm_perdomain_pt) & PAGE_MASK)
+ | __PAGE_HYPERVISOR);
+
+ unmap_domain_mem(mpl2e);
+}
+
+static int vmx_final_setup_guestos(struct exec_domain *d,
+ full_execution_context_t *full_context)
+{
+ int error;
+ execution_context_t *context;
+ struct vmcs_struct *vmcs;
+ unsigned long guest_pa;
+
+ context = &full_context->cpu_ctxt;
+
+ /*
+ * Create a new VMCS
+ */
+ if (!(vmcs = alloc_vmcs())) {
+ printk("Failed to create a new VMCS\n");
+ return -ENOMEM;
+ }
+
+ memset(&d->thread.arch_vmx, 0, sizeof (struct arch_vmx_struct));
+
+ d->thread.arch_vmx.vmcs = vmcs;
+ error = construct_vmcs(&d->thread.arch_vmx, context, full_context, VMCS_USE_HOST_ENV);
+ if (error < 0) {
+ printk("Failed to construct a new VMCS\n");
+ goto out;
+ }
+
+ monitor_mk_pagetable(d);
+ guest_pa = pagetable_val(d->mm.pagetable);
+ clear_bit(VMX_CPU_STATE_PG_ENABLED, &d->thread.arch_vmx.cpu_state);
+
+ d->thread.arch_vmx.vmx_platform.real_mode_data =
+ (unsigned long *) context->esi;
+
+ memset(&d->domain->shared_info->evtchn_mask[0], 0xff,
+ sizeof(d->domain->shared_info->evtchn_mask));
+ clear_bit(IOPACKET_PORT, &d->domain->shared_info->evtchn_mask[0]);
+
+ d->thread.schedule_tail = arch_vmx_do_launch;
+
+ return 0;
+
+out:
+ free_vmcs(vmcs);
+ d->thread.arch_vmx.vmcs = 0;
+ return error;
+}
+#endif
+
int arch_final_setup_guestos(struct exec_domain *d, full_execution_context_t *c)
{
unsigned long phys_basetab;
}
}
+#ifdef CONFIG_VMX
+ if (c->flags & ECF_VMX_GUEST)
+ return vmx_final_setup_guestos(d, c);
+#endif
+
return 0;
}
struct tss_struct *tss = init_tss + smp_processor_id();
execution_context_t *stack_ec = get_execution_context();
int i;
-
+ unsigned long vmx_domain = next_p->thread.arch_vmx.flags;
+
__cli();
/* Switch guest general-register state. */
&next_p->thread.user_ctxt,
sizeof(*stack_ec));
- SET_FAST_TRAP(&next_p->thread);
-
- /* Switch the guest OS ring-1 stack. */
- tss->esp1 = next->guestos_sp;
- tss->ss1 = next->guestos_ss;
-
/* Maybe switch the debug registers. */
if ( unlikely(next->debugreg[7]) )
{
loaddebug(next, 7);
}
+ if (vmx_domain) {
+ /* Switch page tables. */
+ write_ptbase(&next_p->mm);
+
+ set_current(next_p);
+ /* Switch GDT and LDT. */
+ __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->mm.gdt));
+
+ __sti();
+ return;
+ }
+
+ SET_FAST_TRAP(&next_p->thread);
+
+ /* Switch the guest OS ring-1 stack. */
+ tss->esp1 = next->guestos_sp;
+ tss->ss1 = next->guestos_ss;
+
/* Switch page tables. */
write_ptbase(&next_p->mm);
}
int irq_vector[NR_IRQS] = { FIRST_DEVICE_VECTOR , 0 };
+#ifdef CONFIG_VMX
+int vector_irq[256];
+#endif
+
static int __init assign_irq_vector(int irq)
{
static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
panic("ran out of interrupt sources!");
IO_APIC_VECTOR(irq) = current_vector;
+#ifdef CONFIG_VMX
+ vector_irq[current_vector] = irq;
+ printk("vector_irq[%x] = %d\n", current_vector, irq);
+#endif
return current_vector;
}
}
}
#endif
+
+#ifdef CONFIG_VMX
+ start_vmx();
+#endif
+
}
static void __init init_amd(struct cpuinfo_x86 *c)
/* We clear L2 pages by zeroing the guest entries. */
case PGT_l2_page_table:
p = map_domain_mem((spage - frame_table) << PAGE_SHIFT);
- memset(p, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*p));
+ if (m->shadow_mode == SHM_full_32)
+ memset(p, 0, ENTRIES_PER_L2_PAGETABLE * sizeof(*p));
+ else
+ memset(p, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*p));
unmap_domain_mem(p);
break;
free_shadow_page(&d->exec_domain[0]->mm, &frame_table[spfn]);
}
+#ifdef CONFIG_VMX
+void vmx_shadow_clear_state(struct mm_struct *m)
+{
+ SH_VVLOG("vmx_clear_shadow_state: \n");
+ clear_shadow_state(m);
+}
+#endif
+
+
unsigned long shadow_l2_table(
struct mm_struct *m, unsigned long gpfn)
{
struct pfn_info *spfn_info;
unsigned long spfn;
- l2_pgentry_t *spl2e;
+ l2_pgentry_t *spl2e = 0, *gpl2e;
+ unsigned long guest_gpfn;
+
+ __get_machine_to_phys(m, guest_gpfn, gpfn);
SH_VVLOG("shadow_l2_table( %08lx )", gpfn);
perfc_incr(shadow_l2_pages);
spfn = spfn_info - frame_table;
-
- /* Mark pfn as being shadowed; update field to point at shadow. */
- set_shadow_status(m, gpfn, spfn | PSH_shadowed);
+ /* Mark pfn as being shadowed; update field to point at shadow. */
+ set_shadow_status(m, guest_gpfn, spfn | PSH_shadowed);
- spl2e = (l2_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT);
-
- /*
- * We could proactively fill in PDEs for pages that are already shadowed.
- * However, we tried it and it didn't help performance. This is simpler.
- */
- memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-
#ifdef __i386__
/* Install hypervisor and 2x linear p.t. mapings. */
- memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
- &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
- HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
- spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
- mk_l2_pgentry((gpfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
- spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
- mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
- spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
- mk_l2_pgentry(__pa(frame_table[gpfn].u.inuse.domain->mm_perdomain_pt) |
- __PAGE_HYPERVISOR);
+ if (m->shadow_mode == SHM_full_32)
+ vmx_update_shadow_state(m, gpfn, spfn);
+ else {
+ spl2e = (l2_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT);
+ // can't use the linear map as we may not be in the right PT
+ gpl2e = (l2_pgentry_t *) map_domain_mem(gpfn << PAGE_SHIFT);
+ /*
+ * We could proactively fill in PDEs for pages that are already shadowed.
+ * However, we tried it and it didn't help performance. This is simpler.
+ */
+ memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+
+ /* Install hypervisor and 2x linear p.t. mapings. */
+ memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+ &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+ HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+ spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
+ mk_l2_pgentry((gpfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+ spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
+ mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+ spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
+ mk_l2_pgentry(__pa(frame_table[gpfn].u.inuse.domain->mm_perdomain_pt) |
+ __PAGE_HYPERVISOR);
+ }
#endif
- unmap_domain_mem(spl2e);
+ if (m->shadow_mode != SHM_full_32)
+ {
+ unmap_domain_mem(spl2e);
+ }
SH_VLOG("shadow_l2_table( %08lx -> %08lx)", gpfn, spfn);
return spfn;
static void shadow_map_l1_into_current_l2(unsigned long va)
{
struct mm_struct *m = ¤t->mm;
- unsigned long *gpl1e, *spl1e, gpde, spde, gl1pfn, sl1pfn, sl1ss;
+ unsigned long *gpl1e, *spl1e, gpl2e, spl2e, gl1pfn, sl1pfn=0, sl1ss;
struct pfn_info *sl1pfn_info;
int i;
- gpde = l2_pgentry_val(linear_l2_table[va >> L2_PAGETABLE_SHIFT]);
+ __guest_get_pl2e(m, va, &gpl2e);
- gl1pfn = gpde >> PAGE_SHIFT;
+ gl1pfn = gpl2e >> PAGE_SHIFT;
sl1ss = __shadow_status(m, gl1pfn);
if ( !(sl1ss & PSH_shadowed) )
set_shadow_status(m, gl1pfn, PSH_shadowed | sl1pfn);
- l2pde_general(m, &gpde, &spde, sl1pfn);
+ l2pde_general(m, &gpl2e, &spl2e, sl1pfn);
- linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde);
- shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] =
- mk_l2_pgentry(spde);
+ __guest_set_pl2e(m, va, gpl2e);
+ __shadow_set_pl2e(m, va, spl2e);
gpl1e = (unsigned long *) &(linear_pg_table[
(va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1)]);
SH_VVLOG("4b: was shadowed, l2 missing ( %08lx )", sl1pfn);
sl1pfn = sl1ss & PSH_pfn_mask;
- l2pde_general(m, &gpde, &spde, sl1pfn);
-
- linear_l2_table[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde);
- shadow_linear_l2_table[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(spde);
+ l2pde_general(m, &gpl2e, &spl2e, sl1pfn);
+ __guest_set_pl2e(m, va, gpl2e);
+ __shadow_set_pl2e(m, va, spl2e);
}
}
+#ifdef CONFIG_VMX
+void vmx_shadow_invlpg(struct mm_struct *m, unsigned long va)
+{
+ unsigned long gpte, spte, host_pfn;
+
+ if (__put_user(0L, (unsigned long *)
+ &shadow_linear_pg_table[va >> PAGE_SHIFT])) {
+ vmx_shadow_clear_state(m);
+ return;
+ }
+
+ if (__get_user(gpte, (unsigned long *)
+ &linear_pg_table[va >> PAGE_SHIFT])) {
+ return;
+ }
+
+ host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT];
+ spte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
+
+ if (__put_user(spte, (unsigned long *)
+ &shadow_linear_pg_table[va >> PAGE_SHIFT])) {
+ return;
+ }
+}
+#endif
+
int shadow_fault(unsigned long va, long error_code)
{
unsigned long gpte, spte;
int level, int i)
{
unsigned long mask, gpfn, spfn;
+#ifdef CONFIG_VMX
+ unsigned long guest_gpfn;
+#endif
if ( (spte == 0) || (spte == 0xdeadface) || (spte == 0x00000E00) )
return 1; /* always safe */
if ( level < 2 )
FAIL("Shadow in L1 entry?");
- if ( __shadow_status(m, gpfn) != (PSH_shadowed | spfn) )
- FAIL("spfn problem g.sf=%08lx", __shadow_status(m, gpfn));
+ if (m->shadow_mode == SHM_full_32) {
+
+ guest_gpfn = phys_to_machine_mapping[gpfn];
+
+ if ( __shadow_status(m, guest_gpfn) != (PSH_shadowed | spfn) )
+ FAIL("spfn problem g.sf=%08lx",
+ __shadow_status(m, guest_gpfn) );
+
+ } else {
+ if ( __shadow_status(m, gpfn) != (PSH_shadowed | spfn) )
+ FAIL("spfn problem g.sf=%08lx",
+ __shadow_status(m, gpfn) );
+ }
+
}
return 1;
unsigned long gpfn, spfn;
int i;
l2_pgentry_t *gpl2e, *spl2e;
+ unsigned long host_gpfn = 0;
sh_check_name = s;
gpfn = gptbase >> PAGE_SHIFT;
- if ( !(__shadow_status(m, gpfn) & PSH_shadowed) )
+ __get_phys_to_machine(m, host_gpfn, gpfn);
+
+ if ( ! (__shadow_status(m, gpfn) & PSH_shadowed) )
{
printk("%s-PT %08lx not shadowed\n", s, gptbase);
- if ( __shadow_status(m, gpfn) != 0 )
- BUG();
- return 0;
- }
+
+ if( __shadow_status(m, gpfn) != 0 ) BUG();
+ return 0;
+ }
spfn = __shadow_status(m, gpfn) & PSH_pfn_mask;
- if ( __shadow_status(m, gpfn) != (PSH_shadowed | spfn) )
- FAILPT("ptbase shadow inconsistent1");
+ if ( ! __shadow_status(m, gpfn) == (PSH_shadowed | spfn) )
+ FAILPT("ptbase shadow inconsistent1");
+
+ if (m->shadow_mode == SHM_full_32)
+ {
+ host_gpfn = phys_to_machine_mapping[gpfn];
+ gpl2e = (l2_pgentry_t *) map_domain_mem( host_gpfn << PAGE_SHIFT );
+
+ } else
+ gpl2e = (l2_pgentry_t *) map_domain_mem( gpfn << PAGE_SHIFT );
- gpl2e = (l2_pgentry_t *) map_domain_mem( gpfn << PAGE_SHIFT );
spl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
{
- printk("gpfn=%08lx spfn=%08lx\n", gpfn, spfn);
for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
i++ )
L2_PAGETABLE_SHIFT]),
(spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
- if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
- ((__pa(frame_table[gpfn].u.inuse.domain->mm.perdomain_pt) |
+ if (m->shadow_mode != SHM_full_32) {
+ if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
+ ((__pa(frame_table[gpfn].u.inuse.domain->mm.perdomain_pt) |
__PAGE_HYPERVISOR))) )
- FAILPT("hypervisor per-domain map inconsistent");
-
+ FAILPT("hypervisor per-domain map inconsistent");
+ }
/* Check the whole L2. */
for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
static unsigned long wc_sec, wc_usec; /* UTC time at last 'time update'. */
static rwlock_t time_lock = RW_LOCK_UNLOCKED;
-static void timer_interrupt(int irq, void *dev_id, struct xen_regs *regs)
+void timer_interrupt(int irq, void *dev_id, struct xen_regs *regs)
{
write_lock_irq(&time_lock);
--- /dev/null
+/*
+ * vmx.c: handling VMX architecture-related VM exits
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <asm/current.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/shadow.h>
+#include <asm/regs.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/types.h>
+#include <asm/msr.h>
+#include <asm/spinlock.h>
+#include <asm/vmx.h>
+#include <asm/vmx_vmcs.h>
+#include <public/io/ioreq.h>
+
+int vmcs_size;
+unsigned int opt_vmx_debug_level;
+
+int start_vmx()
+{
+ struct vmcs_struct *vmcs;
+ unsigned long ecx;
+ u64 phys_vmcs; /* debugging */
+
+ vmcs_size = VMCS_SIZE;
+ /*
+ * Xen does not fill x86_capability words except 0.
+ */
+ ecx = cpuid_ecx(1);
+ boot_cpu_data.x86_capability[4] = ecx;
+
+ if (!(test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability)))
+ return 0;
+
+ set_in_cr4(X86_CR4_VMXE); /* Enable VMXE */
+
+ if (!(vmcs = alloc_vmcs())) {
+ printk("Failed to allocate VMCS\n");
+ return 0;
+ }
+
+ phys_vmcs = (u64) virt_to_phys(vmcs);
+
+ if (!(__vmxon(phys_vmcs))) {
+ printk("VMXON is done\n");
+ }
+
+ return 1;
+}
+
+void stop_vmx()
+{
+ if (test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability))
+ __vmxoff();
+}
+
+/*
+ * Not all cases receive a valid value in the VM-exit instruction length field.
+ */
+#define __get_instruction_length(len) \
+ __vmread(INSTRUCTION_LEN, &(len)); \
+ if ((len) < 1 || (len) > 15) \
+ __vmx_bug(®s);
+
+static void inline __update_guest_eip(unsigned long inst_len)
+{
+ unsigned long current_eip;
+
+ __vmread(GUEST_EIP, ¤t_eip);
+ __vmwrite(GUEST_EIP, current_eip + inst_len);
+}
+
+
+#include <asm/domain_page.h>
+
+static int vmx_do_page_fault(unsigned long va, unsigned long error_code)
+{
+ unsigned long eip, pfn;
+ unsigned int index;
+ unsigned long gpde = 0;
+ int result;
+ struct exec_domain *ed = current;
+ struct mm_struct *m = &ed->mm;
+
+#if VMX_DEBUG
+ {
+ __vmread(GUEST_EIP, &eip);
+ VMX_DBG_LOG(DBG_LEVEL_VMMU,
+ "vmx_do_page_fault = 0x%lx, eip = %lx, erro_code = %lx\n",
+ va, eip, error_code);
+ }
+#endif
+ /*
+ * Set up guest page directory cache to make linear_pt_table[] work.
+ */
+ __guest_get_pl2e(m, va, &gpde);
+ if (!(gpde & _PAGE_PRESENT))
+ return 0;
+
+ index = (va >> L2_PAGETABLE_SHIFT);
+ if (!l2_pgentry_val(m->guest_pl2e_cache[index])) {
+ pfn = phys_to_machine_mapping[gpde >> PAGE_SHIFT];
+
+ VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_do_page_fault: pagetable = %lx\n",
+ pagetable_val(m->pagetable));
+
+ m->guest_pl2e_cache[index] =
+ mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+ }
+
+ if ((result = shadow_fault(va, error_code)))
+ return result;
+
+ return 0; /* failed to resolve, i.e raise #PG */
+}
+
+static void vmx_do_general_protection_fault(struct xen_regs *regs)
+{
+ unsigned long eip, error_code;
+
+ __vmread(GUEST_EIP, &eip);
+ __vmread(VM_EXIT_INTR_ERROR_CODE, &error_code);
+
+ VMX_DBG_LOG(DBG_LEVEL_1,
+ "vmx_general_protection_fault: eip = %lx, erro_code = %lx\n",
+ eip, error_code);
+
+ VMX_DBG_LOG(DBG_LEVEL_1,
+ "eax=%x, ebx=%x, ecx=%x, edx=%x, esi=%x, edi=%x\n",
+ regs->eax, regs->ebx, regs->ecx, regs->edx, regs->esi, regs->edi);
+
+ __vmx_bug(regs);
+}
+
+static void vmx_vmexit_do_cpuid(unsigned long input, struct xen_regs *regs)
+{
+ int eax, ebx, ecx, edx;
+ unsigned long eip;
+
+ __vmread(GUEST_EIP, &eip);
+
+ VMX_DBG_LOG(DBG_LEVEL_1,
+ "do_cpuid: (eax) %x, (ebx) %x, (ecx) %x, (edx) %x, (esi) %x, (edi) %x\n", regs->eax, regs->ebx, regs->ecx, regs->edx, regs->esi, regs->edi);
+
+ cpuid(input, &eax, &ebx, &ecx, &edx);
+
+ if (input == 1) {
+ clear_bit(X86_FEATURE_PSE, &edx);
+ clear_bit(X86_FEATURE_PAE, &edx);
+ clear_bit(X86_FEATURE_PSE36, &edx);
+ }
+
+ regs->eax = (unsigned long) eax;
+ regs->ebx = (unsigned long) ebx;
+ regs->ecx = (unsigned long) ecx;
+ regs->edx = (unsigned long) edx;
+
+ VMX_DBG_LOG(DBG_LEVEL_1,
+ "vmx_vmexit_do_cpuid: eip: %lx, input: %lx, out:eax=%x, ebx=%x, ecx=%x, edx=%x\n",
+ eip, input, eax, ebx, ecx, edx);
+
+}
+
+#define CASE_GET_REG_P(REG, reg) \
+ case REG_ ## REG: reg_p = &(regs->reg); break
+
+static void vmx_dr_access (unsigned long exit_qualification, struct xen_regs *regs)
+{
+ unsigned int reg;
+ u32 *reg_p = 0;
+ struct exec_domain *ed = current;
+ u32 eip;
+
+ __vmread(GUEST_EIP, &eip);
+
+ reg = exit_qualification & DEBUG_REG_ACCESS_NUM;
+
+ VMX_DBG_LOG(DBG_LEVEL_1,
+ "vmx_dr_access : eip=%08x, reg=%d, exit_qualification = %lx\n",
+ eip, reg, exit_qualification);
+
+ switch(exit_qualification & DEBUG_REG_ACCESS_REG) {
+ CASE_GET_REG_P(EAX, eax);
+ CASE_GET_REG_P(ECX, ecx);
+ CASE_GET_REG_P(EDX, edx);
+ CASE_GET_REG_P(EBX, ebx);
+ CASE_GET_REG_P(EBP, ebp);
+ CASE_GET_REG_P(ESI, esi);
+ CASE_GET_REG_P(EDI, edi);
+ case REG_ESP:
+ break;
+ default:
+ __vmx_bug(regs);
+ }
+
+ switch (exit_qualification & DEBUG_REG_ACCESS_TYPE) {
+ case TYPE_MOV_TO_DR:
+ /* don't need to check the range */
+ if (reg != REG_ESP)
+ ed->thread.debugreg[reg] = *reg_p;
+ else {
+ unsigned long value;
+ __vmread(GUEST_ESP, &value);
+ ed->thread.debugreg[reg] = value;
+ }
+ break;
+ case TYPE_MOV_FROM_DR:
+ if (reg != REG_ESP)
+ *reg_p = ed->thread.debugreg[reg];
+ else {
+ __vmwrite(GUEST_ESP, ed->thread.debugreg[reg]);
+ }
+ break;
+ }
+}
+
+/*
+ * Invalidate the TLB for va. Invalidate the shadow page corresponding
+ * the address va.
+ */
+static void vmx_vmexit_do_invlpg(unsigned long va)
+{
+ unsigned long eip;
+ struct exec_domain *d = current;
+ unsigned int index;
+
+ __vmread(GUEST_EIP, &eip);
+
+ VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg:eip=%08lx, va=%08lx\n",
+ eip, va);
+
+ /*
+ * We do the safest things first, then try to update the shadow
+ * copying from guest
+ */
+ vmx_shadow_invlpg(&d->mm, va);
+ index = (va >> L2_PAGETABLE_SHIFT);
+ d->mm.guest_pl2e_cache[index] = mk_l2_pgentry(0); /* invalidate pgd cache */
+}
+
+static inline void guest_pl2e_cache_invalidate(struct mm_struct *m)
+{
+ /*
+ * Need to optimize this
+ */
+ memset(m->guest_pl2e_cache, 0, PAGE_SIZE);
+}
+
+static inline unsigned long gva_to_gpa(unsigned long gva)
+{
+ unsigned long gpde, gpte, pfn, index;
+ struct exec_domain *d = current;
+ struct mm_struct *m = &d->mm;
+
+ __guest_get_pl2e(m, gva, &gpde);
+ index = (gva >> L2_PAGETABLE_SHIFT);
+
+ pfn = phys_to_machine_mapping[gpde >> PAGE_SHIFT];
+
+ m->guest_pl2e_cache[index] =
+ mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+
+ if ( unlikely(__get_user(gpte, (unsigned long *)
+ &linear_pg_table[gva >> PAGE_SHIFT])) )
+ {
+ printk("gva_to_gpa EXIT: read gpte faulted" );
+ return 0;
+ }
+
+ if ( !(gpte & _PAGE_PRESENT) )
+ {
+ printk("gva_to_gpa - EXIT: gpte not present (%lx)",gpte );
+ return 0;
+ }
+
+ return (gpte & PAGE_MASK) + (gva & ~PAGE_MASK);
+}
+
+static void vmx_io_instruction(struct xen_regs *regs,
+ unsigned long exit_qualification, unsigned long inst_len)
+{
+ struct exec_domain *d = current;
+ vcpu_iodata_t *vio;
+ ioreq_t *p;
+ unsigned long addr;
+ unsigned long eip;
+
+ extern long evtchn_send(int lport);
+ extern long do_block(void);
+
+ __vmread(GUEST_EIP, &eip);
+
+ VMX_DBG_LOG(DBG_LEVEL_1,
+ "vmx_io_instruction: eip=%08lx, exit_qualification = %lx\n",
+ eip, exit_qualification);
+
+ if (test_bit(6, &exit_qualification))
+ addr = (exit_qualification >> 16) & (0xffff);
+ else
+ addr = regs->edx & 0xffff;
+
+ if (addr == 0x80) {
+ __update_guest_eip(inst_len);
+ return;
+ }
+
+ vio = (vcpu_iodata_t *) d->thread.arch_vmx.vmx_platform.shared_page_va;
+ if (vio == 0) {
+ VMX_DBG_LOG(DBG_LEVEL_1, "bad shared page: %lx\n", (unsigned long) vio);
+ domain_crash();
+ }
+ p = &vio->vp_ioreq;
+ p->dir = test_bit(3, &exit_qualification);
+ set_bit(ARCH_VMX_IO_WAIT, &d->thread.arch_vmx.flags);
+
+ p->pdata_valid = 0;
+ p->count = 1;
+ p->size = (exit_qualification & 7) + 1;
+
+ if (test_bit(4, &exit_qualification)) {
+ p->pdata_valid = 1;
+ p->u.pdata = (void *) ((p->dir == IOREQ_WRITE) ?
+ regs->esi
+ : regs->edi);
+ p->u.pdata = (void *) gva_to_gpa(p->u.data);
+ if (test_bit(5, &exit_qualification))
+ p->count = regs->ecx;
+ if ((p->u.data & PAGE_MASK) !=
+ ((p->u.data + p->count * p->size - 1) & PAGE_MASK)) {
+ printk("stringio crosses page boundary!\n");
+ if (p->u.data & (p->size - 1)) {
+ printk("Not aligned I/O!\n");
+ domain_crash();
+ }
+ p->count = (PAGE_SIZE - (p->u.data & ~PAGE_MASK)) / p->size;
+ } else {
+ __update_guest_eip(inst_len);
+ }
+ } else if (p->dir == IOREQ_WRITE) {
+ p->u.data = regs->eax;
+ __update_guest_eip(inst_len);
+ } else
+ __update_guest_eip(inst_len);
+
+ p->addr = addr;
+ p->port_mm = 0;
+ p->state = STATE_IOREQ_READY;
+ evtchn_send(IOPACKET_PORT);
+ do_block();
+}
+
+#define CASE_GET_REG(REG, reg) \
+ case REG_ ## REG: value = regs->reg; break
+
+/*
+ * Write to control registers
+ */
+static void mov_to_cr(int gp, int cr, struct xen_regs *regs)
+{
+ unsigned long value;
+ unsigned long old_cr;
+ struct exec_domain *d = current;
+
+ switch (gp) {
+ CASE_GET_REG(EAX, eax);
+ CASE_GET_REG(ECX, ecx);
+ CASE_GET_REG(EDX, edx);
+ CASE_GET_REG(EBX, ebx);
+ CASE_GET_REG(EBP, ebp);
+ CASE_GET_REG(ESI, esi);
+ CASE_GET_REG(EDI, edi);
+ case REG_ESP:
+ __vmread(GUEST_ESP, &value);
+ break;
+ default:
+ printk("invalid gp: %d\n", gp);
+ __vmx_bug(regs);
+ }
+
+ VMX_DBG_LOG(DBG_LEVEL_1, "mov_to_cr: CR%d, value = %lx, \n", cr, value);
+ VMX_DBG_LOG(DBG_LEVEL_1, "current = %lx, \n", (unsigned long) current);
+
+ switch(cr) {
+ case 0:
+ {
+ unsigned long old_base_pfn = 0, pfn;
+
+ /*
+ * CR0:
+ * We don't want to lose PE and PG.
+ */
+ __vmwrite(GUEST_CR0, (value | X86_CR0_PE | X86_CR0_PG));
+ __vmwrite(CR0_READ_SHADOW, value);
+
+ if (value & (X86_CR0_PE | X86_CR0_PG) &&
+ !test_bit(VMX_CPU_STATE_PG_ENABLED, &d->thread.arch_vmx.cpu_state)) {
+ /*
+ * Enable paging
+ */
+ set_bit(VMX_CPU_STATE_PG_ENABLED, &d->thread.arch_vmx.cpu_state);
+ /*
+ * The guest CR3 must be pointing to the guest physical.
+ */
+ if (!(pfn = phys_to_machine_mapping[
+ d->thread.arch_vmx.cpu_cr3 >> PAGE_SHIFT]))
+ {
+ VMX_DBG_LOG(DBG_LEVEL_VMMU, "Invalid CR3 value = %lx\n",
+ d->thread.arch_vmx.cpu_cr3);
+ domain_crash(); /* need to take a clean path */
+ }
+ old_base_pfn = pagetable_val(d->mm.pagetable) >> PAGE_SHIFT;
+ /*
+ * Now mm.pagetable points to machine physical.
+ */
+ d->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
+
+ VMX_DBG_LOG(DBG_LEVEL_VMMU, "New mm.pagetable = %lx\n",
+ (unsigned long) (pfn << PAGE_SHIFT));
+
+ shadow_lock(&d->mm);
+ shadow_mode_enable(d->domain, SHM_full_32);
+ shadow_unlock(&d->mm);
+
+ __vmwrite(GUEST_CR3, pagetable_val(d->mm.shadow_table));
+ /*
+ * mm->shadow_table should hold the next CR3 for shadow
+ */
+ VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, pfn = %lx\n",
+ d->thread.arch_vmx.cpu_cr3, pfn);
+ put_page_and_type(&frame_table[old_base_pfn]);
+
+ }
+ break;
+ }
+ case 3:
+ {
+ unsigned long pfn;
+
+ /*
+ * If paging is not enabled yet, simply copy the value to CR3.
+ */
+ if (!test_bit(VMX_CPU_STATE_PG_ENABLED, &d->thread.arch_vmx.cpu_state)) {
+ d->thread.arch_vmx.cpu_cr3 = value;
+ return;
+ }
+
+ guest_pl2e_cache_invalidate(&d->mm);
+ /*
+ * We make a new one if the shadow does not exist.
+ */
+ if (value == d->thread.arch_vmx.cpu_cr3) {
+ /*
+ * This is simple TLB flush, implying the guest has
+ * removed some translation or changed page attributes.
+ * We simply invalidate the shadow.
+ */
+ pfn = phys_to_machine_mapping[value >> PAGE_SHIFT];
+ if ((pfn << PAGE_SHIFT) != pagetable_val(d->mm.pagetable))
+ __vmx_bug(regs);
+ vmx_shadow_clear_state(&d->mm);
+ shadow_invalidate(&d->mm);
+ } else {
+ /*
+ * If different, make a shadow. Check if the PDBR is valid
+ * first.
+ */
+ VMX_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx\n", value);
+ if ((value >> PAGE_SHIFT) > d->domain->max_pages)
+ {
+ VMX_DBG_LOG(DBG_LEVEL_VMMU,
+ "Invalid CR3 value=%lx\n", value);
+ domain_crash(); /* need to take a clean path */
+ }
+ pfn = phys_to_machine_mapping[value >> PAGE_SHIFT];
+ vmx_shadow_clear_state(&d->mm);
+ d->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
+ shadow_mk_pagetable(&d->mm);
+ /*
+ * mm->shadow_table should hold the next CR3 for shadow
+ */
+ d->thread.arch_vmx.cpu_cr3 = value;
+ VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx\n",
+ value);
+ __vmwrite(GUEST_CR3, pagetable_val(d->mm.shadow_table));
+ }
+ break;
+ }
+ case 4:
+ /* CR4 */
+ if (value & X86_CR4_PAE)
+ __vmx_bug(regs); /* not implemented */
+ __vmread(CR4_READ_SHADOW, &old_cr);
+
+ __vmwrite(GUEST_CR4, (value | X86_CR4_VMXE));
+ __vmwrite(CR4_READ_SHADOW, value);
+
+ /*
+ * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
+ * all TLB entries except global entries.
+ */
+ if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)) {
+ vmx_shadow_clear_state(&d->mm);
+ shadow_invalidate(&d->mm);
+ guest_pl2e_cache_invalidate(&d->mm);
+ }
+ break;
+ default:
+ printk("invalid cr: %d\n", gp);
+ __vmx_bug(regs);
+ }
+}
+
+#define CASE_SET_REG(REG, reg) \
+ case REG_ ## REG: \
+ regs->reg = value; \
+ break
+
+/*
+ * Handle a "mov %crN, reg" VM exit: copy the guest-visible value of a
+ * control register into the guest GP register selected by 'gp'.
+ * Only CR3 reaches this path (CR0 and CR4 reads are satisfied from the
+ * read shadows), so any other CR number is treated as a bug.
+ */
+static void mov_from_cr(int cr, int gp, struct xen_regs *regs)
+{
+ unsigned long value;
+ struct exec_domain *d = current;
+
+ if (cr != 3)
+ __vmx_bug(regs);
+
+ /* Return the guest's virtual CR3, cached when the guest last wrote it. */
+ value = (unsigned long) d->thread.arch_vmx.cpu_cr3;
+ ASSERT(value);
+
+ switch (gp) {
+ CASE_SET_REG(EAX, eax);
+ CASE_SET_REG(ECX, ecx);
+ CASE_SET_REG(EDX, edx);
+ CASE_SET_REG(EBX, ebx);
+ CASE_SET_REG(EBP, ebp);
+ CASE_SET_REG(ESI, esi);
+ CASE_SET_REG(EDI, edi);
+ case REG_ESP:
+ /* ESP lives in the VMCS, not only the saved frame; update both. */
+ __vmwrite(GUEST_ESP, value);
+ regs->esp = value;
+ break;
+ default:
+ printk("invalid gp: %d\n", gp);
+ __vmx_bug(regs);
+ }
+
+ VMX_DBG_LOG(DBG_LEVEL_VMMU, "mov_from_cr: CR%d, value = %lx, \n", cr, value);
+}
+
+/*
+ * Decode a CR-access VM exit from its exit qualification and dispatch
+ * to the mov-to/mov-from handlers, or emulate CLTS inline.
+ * Any other access type (e.g. LMSW is not decoded here) is a bug.
+ */
+static void vmx_cr_access (unsigned long exit_qualification, struct xen_regs *regs)
+{
+ unsigned int gp, cr;
+ unsigned long value;
+
+ switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
+ case TYPE_MOV_TO_CR:
+ gp = exit_qualification & CONTROL_REG_ACCESS_REG;
+ cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
+ mov_to_cr(gp, cr, regs);
+ break;
+ case TYPE_MOV_FROM_CR:
+ gp = exit_qualification & CONTROL_REG_ACCESS_REG;
+ cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
+ mov_from_cr(cr, gp, regs);
+ break;
+ case TYPE_CLTS:
+ /* Emulate CLTS: clear TS in the real guest CR0 ... */
+ __vmread(GUEST_CR0, &value);
+ value &= ~X86_CR0_TS; /* clear TS */
+ __vmwrite(GUEST_CR0, value);
+
+ /* ... and in the CR0 read shadow the guest observes. */
+ __vmread(CR0_READ_SHADOW, &value);
+ value &= ~X86_CR0_TS; /* clear TS */
+ __vmwrite(CR0_READ_SHADOW, value);
+ break;
+ default:
+ __vmx_bug(regs);
+ break;
+ }
+}
+
+/*
+ * Emulate RDMSR by executing it on the host with the guest's ECX;
+ * the result lands in the saved EAX:EDX pair.
+ * NOTE(review): no MSR filtering/virtualization here -- the guest
+ * reads raw host MSR values; confirm this is intended.
+ */
+static inline void vmx_do_msr_read(struct xen_regs *regs)
+{
+ VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read: ecx=%x, eax=%x, edx=%x",
+ regs->ecx, regs->eax, regs->edx);
+
+ rdmsr(regs->ecx, regs->eax, regs->edx);
+
+ VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read returns: ecx=%x, eax=%x, edx=%x",
+ regs->ecx, regs->eax, regs->edx);
+}
+
+/*
+ * Guest executed HLT: it has nothing to run, so use this exit to
+ * reschedule by entering the Xen scheduler.
+ */
+static inline void vmx_vmexit_do_hlt()
+{
+ extern long do_block(void);
+#if VMX_DEBUG
+ unsigned long eip;
+ __vmread(GUEST_EIP, &eip);
+#endif
+ /* NOTE(review): 'eip' only exists under VMX_DEBUG; presumably
+ * VMX_DBG_LOG compiles away otherwise -- confirm. */
+ VMX_DBG_LOG(DBG_LEVEL_1, "vmx_vmexit_do_hlt:eip=%08lx\n", eip);
+ __enter_scheduler();
+}
+
+/*
+ * Guest executed MWAIT: treated like HLT -- simply yield the CPU by
+ * entering the scheduler rather than monitoring an address.
+ */
+static inline void vmx_vmexit_do_mwait()
+{
+#if VMX_DEBUG
+ unsigned long eip;
+ __vmread(GUEST_EIP, &eip);
+#endif
+ VMX_DBG_LOG(DBG_LEVEL_1, "vmx_vmexit_do_mwait:eip=%08lx\n", eip);
+ __enter_scheduler();
+}
+
+#define BUF_SIZ 256
+#define MAX_LINE 80
+/* Line buffer for guest console output delivered one char per VMCALL. */
+char print_buf[BUF_SIZ];
+static int index;
+
+/*
+ * Accumulate one character of guest console output; flush the buffer
+ * via printk (tagged with the domain id) on newline or when MAX_LINE
+ * characters have accumulated.
+ * NOTE(review): single shared static buffer/index -- not safe if
+ * multiple VMX guests print concurrently; confirm whether that can
+ * happen here.
+ */
+static void vmx_print_line(const char c, struct exec_domain *d)
+{
+
+ if (index == MAX_LINE || c == '\n') {
+ if (index == MAX_LINE) {
+ print_buf[index++] = c;
+ }
+ print_buf[index] = '\0';
+ printk("(GUEST: %u) %s\n", d->domain->id, (char *) &print_buf);
+ index = 0;
+ }
+ else
+ print_buf[index++] = c;
+}
+
+#ifdef XEN_DEBUGGER
+void save_xen_regs(struct xen_regs *regs)
+{
+ __vmread(GUEST_SS_SELECTOR, &regs->xss);
+ __vmread(GUEST_ESP, &regs->esp);
+ __vmread(GUEST_EFLAGS, &regs->eflags);
+ __vmread(GUEST_CS_SELECTOR, &regs->xcs);
+ __vmread(GUEST_EIP, &regs->eip);
+
+ __vmread(GUEST_GS_SELECTOR, &regs->xgs);
+ __vmread(GUEST_FS_SELECTOR, &regs->xfs);
+ __vmread(GUEST_ES_SELECTOR, &regs->xes);
+ __vmread(GUEST_DS_SELECTOR, &regs->xds);
+}
+
+void restore_xen_regs(struct xen_regs *regs)
+{
+ __vmwrite(GUEST_SS_SELECTOR, regs->xss);
+ __vmwrite(GUEST_ESP, regs->esp);
+ __vmwrite(GUEST_EFLAGS, regs->eflags);
+ __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
+ __vmwrite(GUEST_EIP, regs->eip);
+
+ __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
+ __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
+ __vmwrite(GUEST_ES_SELECTOR, regs->xes);
+ __vmwrite(GUEST_DS_SELECTOR, regs->xds);
+}
+#endif
+
+asmlinkage void vmx_vmexit_handler(struct xen_regs regs)
+{
+ unsigned int exit_reason, idtv_info_field;
+ unsigned long exit_qualification, eip, inst_len = 0;
+ struct exec_domain *d = current;
+ int error;
+
+ if ((error = __vmread(VM_EXIT_REASON, &exit_reason)))
+ __vmx_bug(®s);
+
+ __vmread(IDT_VECTORING_INFO_FIELD, &idtv_info_field);
+ if (idtv_info_field & INTR_INFO_VALID_MASK) {
+ __vmwrite(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
+ if ((idtv_info_field & 0xff) == 14) {
+ unsigned long error_code;
+
+ __vmread(VM_EXIT_INTR_ERROR_CODE, &error_code);
+ printk("#PG error code: %lx\n", error_code);
+ }
+ VMX_DBG_LOG(DBG_LEVEL_1, "idtv_info_field=%x\n",
+ idtv_info_field);
+ }
+
+ /* don't bother with H/W interrupts */
+ if (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT &&
+ exit_reason != EXIT_REASON_VMCALL &&
+ exit_reason != EXIT_REASON_IO_INSTRUCTION)
+ VMX_DBG_LOG(DBG_LEVEL_0, "exit reason = %x\n", exit_reason);
+
+ if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+ __vmread(EXIT_QUALIFICATION, &exit_qualification);
+ __vmread(GUEST_EIP, &eip);
+ domain_crash();
+ return;
+ }
+
+ switch (exit_reason) {
+ case EXIT_REASON_EXCEPTION_NMI:
+ {
+#define VECTOR_DB 1
+#define VECTOR_BP 3
+#define VECTOR_GP 13
+#define VECTOR_PG 14
+
+ /*
+ * We don't set the software-interrupt exiting (INT n).
+ * (1) We can get an exception (e.g. #PG) in the guest, or
+ * (2) NMI
+ */
+ int error;
+ unsigned int vector;
+ unsigned long va;
+ unsigned long error_code;
+
+ if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
+ && !(vector & INTR_INFO_VALID_MASK))
+ __vmx_bug(®s);
+ vector &= 0xff;
+
+ switch (vector) {
+#ifdef XEN_DEBUGGER
+ case VECTOR_DB:
+ {
+ save_xen_regs(®s);
+ pdb_handle_exception(1, ®s, 1);
+ restore_xen_regs(®s);
+ break;
+ }
+ case VECTOR_BP:
+ {
+ save_xen_regs(®s);
+ pdb_handle_exception(3, ®s, 1);
+ restore_xen_regs(®s);
+ break;
+ }
+#endif
+ case VECTOR_GP:
+ {
+ vmx_do_general_protection_fault(®s);
+ break;
+ }
+ case VECTOR_PG:
+ {
+ __vmread(EXIT_QUALIFICATION, &va);
+ __vmread(VM_EXIT_INTR_ERROR_CODE, &error_code);
+ VMX_DBG_LOG(DBG_LEVEL_VMMU,
+ "eax=%x, ebx=%x, ecx=%x, edx=%x, esi=%x, edi=%x\n", regs.eax, regs.ebx, regs.ecx, regs.edx, regs.esi, regs.edi);
+
+ if (!(error = vmx_do_page_fault(va, error_code))) {
+ /*
+ * Inject #PG using Interruption-Information Fields
+ */
+ unsigned long intr_fields;
+
+ intr_fields = (INTR_INFO_VALID_MASK |
+ INTR_TYPE_EXCEPTION |
+ INTR_INFO_DELIEVER_CODE_MASK |
+ VECTOR_PG);
+ __vmwrite(VM_ENTRY_INTR_INFO_FIELD, intr_fields);
+ __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+ d->thread.arch_vmx.cpu_cr2 = va;
+ }
+ break;
+ }
+ default:
+ __vmx_bug(®s);
+ break;
+ }
+ break;
+ }
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ {
+ extern int vector_irq[];
+ extern asmlinkage void do_IRQ(struct xen_regs);
+ extern void smp_apic_timer_interrupt(struct xen_regs *);
+ extern void timer_interrupt(int, void *, struct xen_regs *);
+ unsigned int vector;
+
+ if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
+ && !(vector & INTR_INFO_VALID_MASK))
+ __vmx_bug(®s);
+
+ vector &= 0xff;
+ local_irq_disable();
+
+ if (vector == LOCAL_TIMER_VECTOR) {
+ smp_apic_timer_interrupt(®s);
+ } else {
+ regs.entry_vector = (vector == FIRST_DEVICE_VECTOR?
+ 0 : vector_irq[vector]);
+ do_IRQ(regs);
+ }
+ break;
+ }
+ case EXIT_REASON_PENDING_INTERRUPT:
+ __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
+ MONITOR_CPU_BASED_EXEC_CONTROLS);
+ vmx_intr_assist(d);
+ break;
+ case EXIT_REASON_TASK_SWITCH:
+ __vmx_bug(®s);
+ break;
+ case EXIT_REASON_CPUID:
+ __get_instruction_length(inst_len);
+ vmx_vmexit_do_cpuid(regs.eax, ®s);
+ __update_guest_eip(inst_len);
+ break;
+ case EXIT_REASON_HLT:
+ __get_instruction_length(inst_len);
+ __update_guest_eip(inst_len);
+ vmx_vmexit_do_hlt();
+ break;
+ case EXIT_REASON_INVLPG:
+ {
+ unsigned long va;
+
+ __vmread(EXIT_QUALIFICATION, &va);
+ vmx_vmexit_do_invlpg(va);
+ __get_instruction_length(inst_len);
+ __update_guest_eip(inst_len);
+ break;
+ }
+ case EXIT_REASON_VMCALL:
+ __get_instruction_length(inst_len);
+ __vmread(GUEST_EIP, &eip);
+ __vmread(EXIT_QUALIFICATION, &exit_qualification);
+
+ vmx_print_line(regs.eax, d); /* provides the current domain */
+ __update_guest_eip(inst_len);
+ break;
+ case EXIT_REASON_CR_ACCESS:
+ {
+ __vmread(GUEST_EIP, &eip);
+ __get_instruction_length(inst_len);
+ __vmread(EXIT_QUALIFICATION, &exit_qualification);
+
+ VMX_DBG_LOG(DBG_LEVEL_1, "eip = %lx, inst_len =%lx, exit_qualification = %lx\n",
+ eip, inst_len, exit_qualification);
+ vmx_cr_access(exit_qualification, ®s);
+ __update_guest_eip(inst_len);
+ break;
+ }
+ case EXIT_REASON_DR_ACCESS:
+ __vmread(EXIT_QUALIFICATION, &exit_qualification);
+ vmx_dr_access(exit_qualification, ®s);
+ __get_instruction_length(inst_len);
+ __update_guest_eip(inst_len);
+ break;
+ case EXIT_REASON_IO_INSTRUCTION:
+ __vmread(EXIT_QUALIFICATION, &exit_qualification);
+ __get_instruction_length(inst_len);
+ vmx_io_instruction(®s, exit_qualification, inst_len);
+ break;
+ case EXIT_REASON_MSR_READ:
+ __get_instruction_length(inst_len);
+ vmx_do_msr_read(®s);
+ __update_guest_eip(inst_len);
+ break;
+ case EXIT_REASON_MSR_WRITE:
+ __vmread(GUEST_EIP, &eip);
+ VMX_DBG_LOG(DBG_LEVEL_1, "MSR_WRITE: eip=%08lx, eax=%08x, edx=%08x",
+ eip, regs.eax, regs.edx);
+ /* just ignore this point */
+ __get_instruction_length(inst_len);
+ __update_guest_eip(inst_len);
+ break;
+ case EXIT_REASON_MWAIT_INSTRUCTION:
+ __get_instruction_length(inst_len);
+ __update_guest_eip(inst_len);
+ vmx_vmexit_do_mwait();
+ break;
+ default:
+ __vmx_bug(®s); /* should not happen */
+ }
+ return;
+}
+
+/*
+ * Called from the assembly VM-entry path to install the guest's
+ * virtual CR2 into the real %cr2 just before resuming the guest.
+ * Interrupts are disabled first so a host fault path cannot clobber
+ * %cr2 between here and the VM entry.
+ */
+asmlinkage void load_cr2(void)
+{
+ struct exec_domain *d = current;
+
+ local_irq_disable();
+ asm volatile("movl %0,%%cr2": :"r" (d->thread.arch_vmx.cpu_cr2));
+}
--- /dev/null
+/*
+ * vmx_io.c: handling I/O, interrupts related VMX entry/exit
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/mm.h>
+#include <xen/lib.h>
+#include <xen/errno.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/vmx.h>
+#include <asm/vmx_vmcs.h>
+#include <xen/event.h>
+#include <public/io/ioreq.h>
+
+/*
+ * Complete an I/O request the device model has finished on our behalf.
+ * Acknowledges the IOPACKET_PORT event, validates the shared-page
+ * ioreq state, and writes the result back into the guest context:
+ * string ops adjust ESI/EDI/ECX by the transferred amount (honouring
+ * EFLAGS.DF), non-string reads merge the data into EAX by size.
+ * Returns silently if the vcpu was not actually waiting on I/O.
+ */
+void vmx_io_assist(struct exec_domain *ed)
+{
+ vcpu_iodata_t *vio;
+ ioreq_t *p;
+ struct domain *d = ed->domain;
+ execution_context_t *ec = get_execution_context();
+ unsigned long old_eax;
+ extern long do_block();
+ unsigned long eflags;
+ int dir;
+
+ /* clear the pending event */
+ ed->vcpu_info->evtchn_upcall_pending = 0;
+ /* clear the pending bit for port 2 */
+ clear_bit(IOPACKET_PORT>>5, &ed->vcpu_info->evtchn_pending_sel);
+ clear_bit(IOPACKET_PORT, &d->shared_info->evtchn_pending[0]);
+
+ vio = (vcpu_iodata_t *) ed->thread.arch_vmx.vmx_platform.shared_page_va;
+ if (vio == 0) {
+ VMX_DBG_LOG(DBG_LEVEL_1,
+ "bad shared page: %lx\n", (unsigned long) vio);
+ domain_crash();
+ }
+ p = &vio->vp_ioreq;
+ /* clear IO wait VMX flag */
+ if (test_bit(ARCH_VMX_IO_WAIT, &ed->thread.arch_vmx.flags)) {
+ if (p->state != STATE_IORESP_READY) {
+ printk("got a false I/O reponse\n");
+ do_block();
+ } else {
+ p->state = STATE_INVALID;
+ }
+ clear_bit(ARCH_VMX_IO_WAIT, &ed->thread.arch_vmx.flags);
+ } else {
+ /* Not blocked on I/O: nothing to complete. */
+ return;
+ }
+
+ /* Direction flag decides whether string ops advanced or retreated. */
+ __vmread(GUEST_EFLAGS, &eflags);
+ dir = (eflags & X86_EFLAGS_DF);
+
+ if (p->dir == IOREQ_WRITE) {
+ if (p->pdata_valid) {
+ if (!dir)
+ ec->esi += p->count * p->size;
+ else
+ ec->esi -= p->count * p->size;
+ ec->ecx -= p->count;
+ }
+ return;
+ } else {
+ if (p->pdata_valid) {
+ if (!dir)
+ ec->edi += p->count * p->size;
+ else
+ ec->edi -= p->count * p->size;
+ ec->ecx -= p->count;
+ return;
+ }
+ }
+
+ /* Non-string IN: merge the returned data into EAX by operand size. */
+ old_eax = ec->eax;
+
+ switch(p->size) {
+ case 1:
+ ec->eax = (old_eax & 0xffffff00) | (p->u.data & 0xff);
+ break;
+ case 2:
+ ec->eax = (old_eax & 0xffff0000) | (p->u.data & 0xffff);
+ break;
+ case 4:
+ ec->eax = (p->u.data & 0xffffffff);
+ break;
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Find-last-set: return the index (0-31) of the highest set bit in
+ * 'word', or -1 if no bit is set.  BSR leaves the destination
+ * undefined for a zero input, hence the explicit 'word ?' guard.
+ */
+static inline int __fls(unsigned long word)
+{
+ int bit;
+
+ __asm__("bsrl %1,%0"
+ :"=r" (bit)
+ :"rm" (word));
+ return word ? bit : -1;
+}
+
+
+/* Simple minded Local APIC priority implementation. Fix later */
+/*
+ * Scan a 256-bit pending-interrupt bitmap (eight 32-bit words) from
+ * the top word down and return the highest pending vector number.
+ * Falls through to __fls(pintr[0]), which yields -1 when nothing at
+ * all is pending.
+ */
+static __inline__ int find_highest_irq(unsigned long *pintr)
+{
+ if (pintr[7])
+ return __fls(pintr[7]) + (256-32*1);
+ if (pintr[6])
+ return __fls(pintr[6]) + (256-32*2);
+ if (pintr[5])
+ return __fls(pintr[5]) + (256-32*3);
+ if (pintr[4])
+ return __fls(pintr[4]) + (256-32*4);
+ if (pintr[3])
+ return __fls(pintr[3]) + (256-32*5);
+ if (pintr[2])
+ return __fls(pintr[2]) + (256-32*6);
+ if (pintr[1])
+ return __fls(pintr[1]) + (256-32*7);
+ return __fls(pintr[0]);
+}
+
+/*
+ * Return 0-255 for pending irq.
+ * -1 when no pending.
+ * The pending bitmap lives in the shared page written by the device
+ * model; a missing shared page crashes the domain.
+ */
+static inline int find_highest_pending_irq(struct exec_domain *d)
+{
+ vcpu_iodata_t *vio;
+
+ vio = (vcpu_iodata_t *) d->thread.arch_vmx.vmx_platform.shared_page_va;
+ if (vio == 0) {
+ VMX_DBG_LOG(DBG_LEVEL_1,
+ "bad shared page: %lx\n", (unsigned long) vio);
+ domain_crash();
+ }
+ /* NOTE(review): assumes domain_crash() does not return here. */
+
+ return find_highest_irq(&vio->vp_intr[0]);
+}
+
+/*
+ * Clear 'vector' from the shared-page pending-interrupt bitmap after
+ * it has been injected into the guest.  Crashes the domain if the
+ * shared page is absent.
+ */
+static inline void clear_highest_bit(struct exec_domain *d, int vector)
+{
+ vcpu_iodata_t *vio;
+
+ vio = (vcpu_iodata_t *) d->thread.arch_vmx.vmx_platform.shared_page_va;
+ if (vio == 0) {
+ VMX_DBG_LOG(DBG_LEVEL_1,
+ "bad shared page: %lx\n", (unsigned long) vio);
+ domain_crash();
+ }
+
+ clear_bit(vector, &vio->vp_intr[0]);
+}
+
+/* True when the guest has interrupts masked (EFLAGS.IF clear). */
+static inline int irq_masked(unsigned long eflags)
+{
+ return ((eflags & X86_EFLAGS_IF) == 0);
+}
+
+/*
+ * Inject the highest-priority pending virtual interrupt into the
+ * guest via the VM-entry interruption-information field.  Bails out
+ * if nothing is pending, if an event injection is already queued for
+ * this entry, or if the guest has interrupts masked.
+ */
+void vmx_intr_assist(struct exec_domain *d)
+{
+ int highest_vector = find_highest_pending_irq(d);
+ unsigned long intr_fields, eflags;
+
+ if (highest_vector == -1)
+ return;
+
+ /* An injection is already queued for this VM entry; don't clobber it. */
+ __vmread(VM_ENTRY_INTR_INFO_FIELD, &intr_fields);
+ if (intr_fields & INTR_INFO_VALID_MASK) {
+ VMX_DBG_LOG(DBG_LEVEL_1, "vmx_intr_assist: intr_fields: %lx\n",
+ intr_fields);
+ return;
+ }
+
+ __vmread(GUEST_EFLAGS, &eflags);
+ if (irq_masked(eflags)) {
+ VMX_DBG_LOG(DBG_LEVEL_1, "guesting pending: %x, eflags: %lx\n",
+ highest_vector, eflags);
+ return;
+ }
+
+ /* Consume the pending bit and program the injection. */
+ clear_highest_bit(d, highest_vector);
+ intr_fields = (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | highest_vector);
+ __vmwrite(VM_ENTRY_INTR_INFO_FIELD, intr_fields);
+
+ /* Clear interruptibility state so the injection is not blocked. */
+ __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
+
+ return;
+}
+
+/*
+ * Per-resume fixups before re-entering a VMX guest: refresh the
+ * host/guest CR3 and host stack fields (they may change across a
+ * context switch), complete any finished I/O request, and inject a
+ * pending virtual interrupt unless the vcpu is still blocked on I/O.
+ */
+void vmx_do_resume(struct exec_domain *d)
+{
+ extern long do_block();
+
+ __vmwrite(HOST_CR3, pagetable_val(d->mm.monitor_table));
+ __vmwrite(GUEST_CR3, pagetable_val(d->mm.shadow_table));
+ __vmwrite(HOST_ESP, (unsigned long) get_stack_top());
+
+ if (event_pending(d)) {
+ if (test_bit(IOPACKET_PORT, &d->domain->shared_info->evtchn_pending[0]))
+ vmx_io_assist(d);
+
+ else if (test_bit(ARCH_VMX_IO_WAIT, &d->thread.arch_vmx.flags)) {
+ printk("got an event while blocked on I/O\n");
+ do_block();
+ }
+
+ /* Assumption: device model will not inject an interrupt
+ * while an ioreq_t is pending i.e. the response and
+ * interrupt can come together. But an interrupt without
+ * a response to ioreq_t is not ok.
+ */
+ }
+ if (!test_bit(ARCH_VMX_IO_WAIT, &d->thread.arch_vmx.flags))
+ vmx_intr_assist(d);
+}
--- /dev/null
+/*
+ * vmx_vmcs.c: VMCS management
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/mm.h>
+#include <xen/lib.h>
+#include <xen/errno.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/vmx.h>
+#include <xen/event.h>
+#include <xen/kernel.h>
+#include <public/io/ioreq.h>
+#include <asm/domain_page.h>
+
+/*
+ * Allocate and zero a VMCS region from the Xen heap, stamping the
+ * revision id the processor expects (chosen by CPUID signature).
+ * NOTE(review): the allocation result is not checked before memset;
+ * an out-of-memory condition would dereference NULL.
+ */
+struct vmcs_struct *alloc_vmcs(void)
+{
+ struct vmcs_struct *vmcs;
+ unsigned int cpu_sig = cpuid_eax(0x00000001);
+
+ vmcs = (struct vmcs_struct *) alloc_xenheap_pages(get_order(vmcs_size));
+ memset((char *) vmcs, 0, vmcs_size); /* don't remove this */
+
+ /* Revision id must match IA32_VMX_BASIC for this stepping. */
+ vmcs->vmcs_revision_id = (cpu_sig > 0xf41)? 3 : 1;
+ return vmcs;
+}
+
+/*
+ * Release a VMCS region back to the Xen heap.
+ * NOTE(review): the order computed here, (vmcs_size >> PAGE_SHIFT) - 1,
+ * does not match the get_order(vmcs_size) used by alloc_vmcs() for all
+ * vmcs_size values -- confirm the two agree for the sizes in use.
+ */
+void free_vmcs(struct vmcs_struct *vmcs)
+{
+ int order;
+
+ order = (vmcs_size >> PAGE_SHIFT) - 1;
+ free_xenheap_pages((unsigned long) vmcs, order);
+}
+
+/*
+ * Program the VM-execution, VM-exit and VM-entry control fields of the
+ * current VMCS with the monitor's fixed settings.  Returns the OR of
+ * all __vmwrite() results (non-zero on any failure).
+ */
+static inline int construct_vmcs_controls(void)
+{
+ int error = 0;
+
+ error |= __vmwrite(PIN_BASED_VM_EXEC_CONTROL,
+ MONITOR_PIN_BASED_EXEC_CONTROLS);
+
+ error |= __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
+ MONITOR_CPU_BASED_EXEC_CONTROLS);
+
+ error |= __vmwrite(VM_EXIT_CONTROLS, MONITOR_VM_EXIT_CONTROLS);
+ error |= __vmwrite(VM_ENTRY_CONTROLS, MONITOR_VM_ENTRY_CONTROLS);
+
+ return error;
+}
+
+#define GUEST_SEGMENT_LIMIT 0xffffffff
+#define HOST_SEGMENT_LIMIT 0xffffffff
+
+struct host_execution_env {
+ /* selectors */
+ unsigned short ldtr_selector;
+ unsigned short tr_selector;
+ unsigned short ds_selector;
+ unsigned short cs_selector;
+ /* limits */
+ unsigned short gdtr_limit;
+ unsigned short ldtr_limit;
+ unsigned short idtr_limit;
+ unsigned short tr_limit;
+ /* base */
+ unsigned long gdtr_base;
+ unsigned long ldtr_base;
+ unsigned long idtr_base;
+ unsigned long tr_base;
+ unsigned long ds_base;
+ unsigned long cs_base;
+ /* control registers */
+ unsigned long cr3;
+ unsigned long cr0;
+ unsigned long cr4;
+ unsigned long dr7;
+};
+
+#define round_pgdown(_p) ((_p)&PAGE_MASK) /* copied from domain.c */
+
+int vmx_setup_platform(struct exec_domain *d, execution_context_t *context)
+{
+ int i;
+ unsigned int n;
+ unsigned long *p, mpfn, offset, addr;
+ struct e820entry *e820p;
+ unsigned long gpfn = 0;
+
+ context->ebx = 0; /* Linux expects ebx to be 0 for boot proc */
+
+ n = context->ecx;
+ if (n > 32) {
+ VMX_DBG_LOG(DBG_LEVEL_1, "Too many e820 entries: %d\n", n);
+ return -1;
+ }
+
+ addr = context->edi;
+ offset = (addr & ~PAGE_MASK);
+ addr = round_pgdown(addr);
+ mpfn = phys_to_machine_mapping[addr >> PAGE_SHIFT];
+ p = map_domain_mem(mpfn << PAGE_SHIFT);
+
+ e820p = (struct e820entry *) ((unsigned long) p + offset);
+
+ for (i = 0; i < n; i++) {
+ if (e820p[i].type == E820_SHARED_PAGE) {
+ gpfn = (e820p[i].addr >> PAGE_SHIFT);
+ break;
+ }
+ }
+
+ if (gpfn == 0) {
+ VMX_DBG_LOG(DBG_LEVEL_1, "No shared Page ?\n");
+ return -1;
+ }
+ unmap_domain_mem(p);
+
+ mpfn = phys_to_machine_mapping[gpfn];
+ p = map_domain_mem(mpfn << PAGE_SHIFT);
+ d->thread.arch_vmx.vmx_platform.shared_page_va = (unsigned long) p;
+
+ return 0;
+}
+
+
+/*
+ * Add <guest pfn, machine pfn> mapping to per-domain mapping. Full
+ * virtualization does not need per-domain mapping.
+ */
+static int add_mapping_perdomain(struct exec_domain *d, unsigned long gpfn,
+ unsigned long mpfn)
+{
+ struct pfn_info *page;
+ unsigned long pfn = 0;
+
+ /*
+ * We support up to 4GB memory for a guest at this point
+ */
+ if (gpfn > ENTRIES_PER_L2_PAGETABLE * ENTRIES_PER_L1_PAGETABLE)
+ return -1;
+
+ if (!(l1_pgentry_val(d->domain->mm_perdomain_pt[
+ gpfn >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)]) & _PAGE_PRESENT))
+ {
+ page = (struct pfn_info *) alloc_domheap_page(NULL);
+ if (!page) {
+ return -1;
+ }
+
+ pfn = (unsigned long) (page - frame_table);
+ d->domain->mm_perdomain_pt[gpfn >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)] =
+ mk_l1_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+ }
+ phys_to_machine_mapping[gpfn] = mpfn;
+
+ return 0;
+}
+
+void vmx_do_launch(struct exec_domain *ed)
+{
+/* Update CR3, GDT, LDT, TR */
+ unsigned int tr, cpu, error = 0;
+ struct host_execution_env host_env;
+ struct Xgt_desc_struct desc;
+ struct list_head *list_ent;
+ l2_pgentry_t *mpl2e, *guest_pl2e_cache;
+ unsigned long i, pfn = 0;
+ struct pfn_info *page;
+ execution_context_t *ec = get_execution_context();
+ struct domain *d = ed->domain;
+
+ cpu = smp_processor_id();
+ ed->mm.min_pfn = ed->mm.max_pfn = 0;
+
+ spin_lock(&d->page_alloc_lock);
+ list_ent = d->page_list.next;
+
+ mpl2e = (l2_pgentry_t *) map_domain_mem(pagetable_val(ed->mm.monitor_table));
+ ASSERT(mpl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]);
+
+ for (i = 0; list_ent != &d->page_list; i++ ) {
+ pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
+ ed->mm.min_pfn = min(ed->mm.min_pfn, pfn);
+ ed->mm.max_pfn = max(ed->mm.max_pfn, pfn);
+ list_ent = frame_table[pfn].list.next;
+ add_mapping_perdomain(ed, i, pfn);
+ }
+
+ spin_unlock(&d->page_alloc_lock);
+
+ page = (struct pfn_info *) alloc_domheap_page(NULL);
+ pfn = (unsigned long) (page - frame_table);
+
+ /*
+ * make linear_pt_table work for guest ptes
+ */
+ mpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
+ mk_l2_pgentry((pfn << PAGE_SHIFT)| __PAGE_HYPERVISOR);
+
+ guest_pl2e_cache = map_domain_mem(pfn << PAGE_SHIFT);
+ memset(guest_pl2e_cache, 0, PAGE_SIZE); /* clean it up */
+ ed->mm.guest_pl2e_cache = guest_pl2e_cache;
+
+ unmap_domain_mem(mpl2e);
+
+ vmx_setup_platform(ed, ec);
+
+ __asm__ __volatile__ ("sgdt (%%eax) \n" :: "a"(&desc) : "memory");
+ host_env.gdtr_limit = desc.size;
+ host_env.gdtr_base = desc.address;
+
+ error |= __vmwrite(HOST_GDTR_BASE, host_env.gdtr_base);
+
+ error |= __vmwrite(GUEST_LDTR_SELECTOR, 0);
+ error |= __vmwrite(GUEST_LDTR_BASE, 0);
+ error |= __vmwrite(GUEST_LDTR_LIMIT, 0);
+
+ __asm__ __volatile__ ("str (%%eax) \n" :: "a"(&tr) : "memory");
+ host_env.tr_selector = tr;
+ host_env.tr_limit = sizeof(struct tss_struct);
+ host_env.tr_base = (unsigned long) &init_tss[cpu];
+
+ error |= __vmwrite(HOST_TR_SELECTOR, host_env.tr_selector);
+ error |= __vmwrite(HOST_TR_BASE, host_env.tr_base);
+ error |= __vmwrite(GUEST_TR_BASE, 0);
+ error |= __vmwrite(GUEST_TR_LIMIT, 0xff);
+
+ ed->mm.shadow_table = ed->mm.pagetable;
+ __vmwrite(GUEST_CR3, pagetable_val(ed->mm.pagetable));
+ __vmwrite(HOST_CR3, pagetable_val(ed->mm.monitor_table));
+ __vmwrite(HOST_ESP, (unsigned long) get_stack_top());
+
+ ed->thread.schedule_tail = arch_vmx_do_resume;
+}
+
+/*
+ * Initially set the same environment as host.
+ */
+static inline int
+construct_init_vmcs_guest(execution_context_t *context,
+ full_execution_context_t *full_context,
+ struct host_execution_env *host_env)
+{
+ int error = 0;
+ union vmcs_arbytes arbytes;
+ unsigned long dr7;
+ unsigned long eflags, shadow_cr;
+
+ /* MSR */
+ error |= __vmwrite(VM_EXIT_MSR_LOAD_ADDR, 0);
+ error |= __vmwrite(VM_EXIT_MSR_STORE_ADDR, 0);
+
+ error |= __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
+ error |= __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
+ error |= __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
+ /* interrupt */
+ error |= __vmwrite(VM_ENTRY_INTR_INFO_FIELD, 0);
+ /* mask */
+ error |= __vmwrite(CR0_GUEST_HOST_MASK, 0xffffffff);
+ error |= __vmwrite(CR4_GUEST_HOST_MASK, 0xffffffff);
+
+ error |= __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
+ error |= __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+
+ /* TSC */
+ error |= __vmwrite(TSC_OFFSET, 0);
+ error |= __vmwrite(CR3_TARGET_COUNT, 0);
+
+ /* Guest Selectors */
+ error |= __vmwrite(GUEST_CS_SELECTOR, context->cs);
+ error |= __vmwrite(GUEST_ES_SELECTOR, context->es);
+ error |= __vmwrite(GUEST_SS_SELECTOR, context->ss);
+ error |= __vmwrite(GUEST_DS_SELECTOR, context->ds);
+ error |= __vmwrite(GUEST_FS_SELECTOR, context->fs);
+ error |= __vmwrite(GUEST_GS_SELECTOR, context->gs);
+
+ /* Guest segment Limits */
+ error |= __vmwrite(GUEST_CS_LIMIT, GUEST_SEGMENT_LIMIT);
+ error |= __vmwrite(GUEST_ES_LIMIT, GUEST_SEGMENT_LIMIT);
+ error |= __vmwrite(GUEST_SS_LIMIT, GUEST_SEGMENT_LIMIT);
+ error |= __vmwrite(GUEST_DS_LIMIT, GUEST_SEGMENT_LIMIT);
+ error |= __vmwrite(GUEST_FS_LIMIT, GUEST_SEGMENT_LIMIT);
+ error |= __vmwrite(GUEST_GS_LIMIT, GUEST_SEGMENT_LIMIT);
+
+ error |= __vmwrite(GUEST_IDTR_LIMIT, host_env->idtr_limit);
+
+ /* AR bytes */
+ arbytes.bytes = 0;
+ arbytes.fields.seg_type = 0x3; /* type = 3 */
+ arbytes.fields.s = 1; /* code or data, i.e. not system */
+ arbytes.fields.dpl = 0; /* DPL = 0 */
+ arbytes.fields.p = 1; /* segment present */
+ arbytes.fields.default_ops_size = 1; /* 32-bit */
+ arbytes.fields.g = 1;
+ arbytes.fields.null_bit = 0; /* not null */
+
+ error |= __vmwrite(GUEST_ES_AR_BYTES, arbytes.bytes);
+ error |= __vmwrite(GUEST_SS_AR_BYTES, arbytes.bytes);
+ error |= __vmwrite(GUEST_DS_AR_BYTES, arbytes.bytes);
+ error |= __vmwrite(GUEST_FS_AR_BYTES, arbytes.bytes);
+ error |= __vmwrite(GUEST_GS_AR_BYTES, arbytes.bytes);
+
+ arbytes.fields.seg_type = 0xb; /* type = 0xb */
+ error |= __vmwrite(GUEST_CS_AR_BYTES, arbytes.bytes);
+
+ error |= __vmwrite(GUEST_GDTR_BASE, context->edx);
+ context->edx = 0;
+ error |= __vmwrite(GUEST_GDTR_LIMIT, context->eax);
+ context->eax = 0;
+
+ arbytes.fields.s = 0; /* not code or data segment */
+ arbytes.fields.seg_type = 0x2; /* LDT */
+ arbytes.fields.default_ops_size = 0; /* 16-bit */
+ arbytes.fields.g = 0;
+ error |= __vmwrite(GUEST_LDTR_AR_BYTES, arbytes.bytes);
+
+ arbytes.fields.seg_type = 0xb; /* 32-bit TSS (busy) */
+ error |= __vmwrite(GUEST_TR_AR_BYTES, arbytes.bytes);
+
+ error |= __vmwrite(GUEST_CR0, host_env->cr0); /* same CR0 */
+
+ /* Initially PG, PE are not set */
+ shadow_cr = host_env->cr0;
+ shadow_cr &= ~(X86_CR0_PE | X86_CR0_PG);
+ error |= __vmwrite(CR0_READ_SHADOW, shadow_cr);
+ /* CR3 is set in vmx_final_setup_guestos */
+ error |= __vmwrite(GUEST_CR4, host_env->cr4);
+ shadow_cr = host_env->cr4;
+ shadow_cr &= ~(X86_CR4_PGE | X86_CR4_VMXE);
+ error |= __vmwrite(CR4_READ_SHADOW, shadow_cr);
+
+ error |= __vmwrite(GUEST_ES_BASE, host_env->ds_base);
+ error |= __vmwrite(GUEST_CS_BASE, host_env->cs_base);
+ error |= __vmwrite(GUEST_SS_BASE, host_env->ds_base);
+ error |= __vmwrite(GUEST_DS_BASE, host_env->ds_base);
+ error |= __vmwrite(GUEST_FS_BASE, host_env->ds_base);
+ error |= __vmwrite(GUEST_GS_BASE, host_env->ds_base);
+ error |= __vmwrite(GUEST_IDTR_BASE, host_env->idtr_base);
+
+ error |= __vmwrite(GUEST_ESP, context->esp);
+ error |= __vmwrite(GUEST_EIP, context->eip);
+
+ eflags = context->eflags & ~VMCS_EFLAGS_RESERVED_0; /* clear 0s */
+ eflags |= VMCS_EFLAGS_RESERVED_1; /* set 1s */
+
+ error |= __vmwrite(GUEST_EFLAGS, eflags);
+
+ error |= __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
+ __asm__ __volatile__ ("mov %%dr7, %0\n" : "=r" (dr7));
+ error |= __vmwrite(GUEST_DR7, dr7);
+ error |= __vmwrite(GUEST_VMCS0, 0xffffffff);
+ error |= __vmwrite(GUEST_VMCS1, 0xffffffff);
+
+ return error;
+}
+
+static inline int construct_vmcs_host(struct host_execution_env *host_env)
+{
+ int error = 0;
+ unsigned long crn;
+ struct Xgt_desc_struct desc;
+
+ /* Host Selectors */
+ host_env->ds_selector = __HYPERVISOR_DS;
+ error |= __vmwrite(HOST_ES_SELECTOR, host_env->ds_selector);
+ error |= __vmwrite(HOST_SS_SELECTOR, host_env->ds_selector);
+ error |= __vmwrite(HOST_DS_SELECTOR, host_env->ds_selector);
+ error |= __vmwrite(HOST_FS_SELECTOR, host_env->ds_selector);
+ error |= __vmwrite(HOST_GS_SELECTOR, host_env->ds_selector);
+
+ host_env->cs_selector = __HYPERVISOR_CS;
+ error |= __vmwrite(HOST_CS_SELECTOR, host_env->cs_selector);
+
+ host_env->ds_base = 0;
+ host_env->cs_base = 0;
+ error |= __vmwrite(HOST_FS_BASE, host_env->ds_base);
+ error |= __vmwrite(HOST_GS_BASE, host_env->ds_base);
+
+/* Debug */
+ __asm__ __volatile__ ("sidt (%%eax) \n" :: "a"(&desc) : "memory");
+ host_env->idtr_limit = desc.size;
+ host_env->idtr_base = desc.address;
+ error |= __vmwrite(HOST_IDTR_BASE, host_env->idtr_base);
+
+ __asm__ __volatile__ ("movl %%cr0,%0" : "=r" (crn) : );
+ host_env->cr0 = crn;
+ error |= __vmwrite(HOST_CR0, crn); /* same CR0 */
+
+ /* CR3 is set in vmx_final_setup_hostos */
+ __asm__ __volatile__ ("movl %%cr4,%0" : "=r" (crn) : );
+ host_env->cr4 = crn;
+ error |= __vmwrite(HOST_CR4, crn);
+ error |= __vmwrite(HOST_EIP, (unsigned long) vmx_asm_vmexit_handler);
+
+ return error;
+}
+
+/*
+ * Need to extend to support full virtualization.
+ * The variable use_host_env indicates if the new VMCS needs to use
+ * the same setups as the host has (xenolinux).
+ */
+
+int construct_vmcs(struct arch_vmx_struct *arch_vmx,
+ execution_context_t *context,
+ full_execution_context_t *full_context,
+ int use_host_env)
+{
+ int error;
+ u64 vmcs_phys_ptr;
+
+ struct host_execution_env host_env;
+
+ if (use_host_env != VMCS_USE_HOST_ENV)
+ return -EINVAL;
+
+ memset(&host_env, 0, sizeof(struct host_execution_env));
+
+ vmcs_phys_ptr = (u64) virt_to_phys(arch_vmx->vmcs);
+
+ if ((error = __vmpclear (vmcs_phys_ptr))) {
+ printk("construct_vmcs: VMCLEAR failed\n");
+ return -EINVAL;
+ }
+ if ((error = load_vmcs(arch_vmx, vmcs_phys_ptr))) {
+ printk("construct_vmcs: load_vmcs failed: VMCS = %lx\n",
+ (unsigned long) vmcs_phys_ptr);
+ return -EINVAL;
+ }
+ if ((error = construct_vmcs_controls())) {
+ printk("construct_vmcs: construct_vmcs_controls failed\n");
+ return -EINVAL;
+ }
+ /* host selectors */
+ if ((error = construct_vmcs_host(&host_env))) {
+ printk("construct_vmcs: construct_vmcs_host failed\n");
+ return -EINVAL;
+ }
+ /* guest selectors */
+ if ((error = construct_init_vmcs_guest(context, full_context, &host_env))) {
+ printk("construct_vmcs: construct_vmcs_guest failed\n");
+ return -EINVAL;
+ }
+
+ if ((error |= __vmwrite(EXCEPTION_BITMAP,
+ MONITOR_DEFAULT_EXCEPTION_BITMAP))) {
+ printk("construct_vmcs: setting Exception bitmap failed\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Make the VMCS at 'phys_ptr' current on this CPU via VMPTRLD and
+ * track the loaded state in arch_vmx->flags.  Returns the VMPTRLD
+ * error code (0 on success).
+ */
+int load_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr)
+{
+ int error;
+
+ if ((error = __vmptrld(phys_ptr))) {
+ clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags);
+ return error;
+ }
+ set_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags);
+ return 0;
+}
+
+/*
+ * Store the current-VMCS pointer via VMPTRST and mark the VMCS as no
+ * longer loaded.  Always returns 0.
+ */
+int store_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr)
+{
+ /* take the current VMCS */
+ __vmptrst(phys_ptr);
+ clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags);
+ return 0;
+}
+
+/* Called from assembly when VMLAUNCH fails; fatal by design. */
+void vm_launch_fail(unsigned long eflags)
+{
+ BUG();
+}
+
+/* Called from assembly when VMRESUME fails; fatal by design. */
+void vm_resume_fail(unsigned long eflags)
+{
+ BUG();
+}
+
andl $~3,reg; \
movl (reg),reg;
+#ifdef CONFIG_VMX
+/*
+ * At VMExit time the processor saves the guest selectors, esp, eip,
+ * and eflags. Therefore we don't save them, but simply decrement
+ * the kernel stack pointer to make it consistent with the stack frame
+ * at usual interruption time. The eflags of the host is not saved by VMX,
+ * and we set it to the fixed value.
+ *
+ * We also need the room, especially because orig_eax field is used
+ * by do_IRQ(). Compared the xen_regs, we skip pushing for the following:
+ * (1/1) u16 error_code;
+ * (2/1) u16 entry_vector;
+ * (2) u32 eip;
+ * (3) u32 cs;
+ * (4) u32 eflags;
+ */
+#define VMX_MONITOR_EFLAGS 0x202 /* IF on */
+#define NR_SKIPPED_REGS 4 /* See the above explanation */
+#define VMX_SAVE_ALL_NOSEGREGS \
+ pushl $VMX_MONITOR_EFLAGS; \
+ popf; \
+ subl $(NR_SKIPPED_REGS*4), %esp; \
+ pushl %eax; \
+ pushl %ebp; \
+ pushl %edi; \
+ pushl %esi; \
+ pushl %edx; \
+ pushl %ecx; \
+ pushl %ebx;
+
+ENTRY(vmx_asm_vmexit_handler)
+ /* selectors are restored/saved by VMX */
+ VMX_SAVE_ALL_NOSEGREGS
+ call SYMBOL_NAME(vmx_vmexit_handler)
+ jmp vmx_asm_do_resume
+
+ENTRY(vmx_asm_do_launch)
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
+ addl $(NR_SKIPPED_REGS*4), %esp
+ /* VMLAUNCH */
+ .byte 0x0f,0x01,0xc2
+ pushf
+ call SYMBOL_NAME(vm_launch_fail)
+ hlt
+
+ ALIGN
+
+ENTRY(vmx_asm_do_resume)
+vmx_test_all_events:
+ GET_CURRENT(%ebx)
+/* test_all_events: */
+ xorl %ecx,%ecx
+ notl %ecx
+ cli # tests must not race interrupts
+/*test_softirqs:*/
+ movl EDOMAIN_processor(%ebx),%eax
+ shl $6,%eax # sizeof(irq_cpustat) == 64
+ test %ecx,SYMBOL_NAME(irq_stat)(%eax,1)
+ jnz vmx_process_softirqs
+
+vmx_restore_all_guest:
+ call SYMBOL_NAME(load_cr2)
+ /*
+ * Check if we are going back to VMX-based VM
+ * By this time, all the setups in the VMCS must be complete.
+ */
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
+ addl $(NR_SKIPPED_REGS*4), %esp
+ /* VMRESUME */
+ .byte 0x0f,0x01,0xc3
+ pushf
+ call SYMBOL_NAME(vm_resume_fail)
+ /* Should never reach here */
+ hlt
+
+ ALIGN
+vmx_process_softirqs:
+ sti
+ call SYMBOL_NAME(do_softirq)
+ jmp vmx_test_all_events
+#endif
+
+ENTRY(continue_nonidle_task)
+ GET_CURRENT(%ebx)
+ jmp test_all_events
+
ALIGN
restore_all_guest:
testb $TF_failsafe_return,EDOMAIN_thread_flags(%ebx)
}
-static long evtchn_send(int lport)
+long evtchn_send(int lport)
{
struct domain *ld = current->domain;
struct exec_domain *rd;
struct domain *dom0;
vm_assist_info_t vm_assist_info[MAX_VMASST_TYPE + 1];
-
+#if 0
struct e820entry {
unsigned long addr_lo, addr_hi; /* start of memory segment */
unsigned long size_lo, size_hi; /* size of memory segment */
unsigned long type; /* type of memory segment */
};
-
+#endif
void start_of_day(void);
/* opt_console: comma-separated list of console outputs. */
#include <xen/init.h>
#include <xen/mm.h>
#include <xen/sched.h>
+#include <asm/vmx_vmcs.h>
#include <xen/softirq.h>
irq_cpustat_t irq_stat[NR_CPUS];
#ifndef __XEN_I386_CONFIG_H__
#define __XEN_I386_CONFIG_H__
+#define CONFIG_VMX 1
#define CONFIG_X86 1
#define CONFIG_SMP 1
#define X86_FEATURE_P4 (3*32+ 7) /* P4 */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
+#define X86_FEATURE_MWAIT (4*32+ 3) /* Monitor/Mwait support */
+#define X86_FEATURE_VMXE (4*32+ 5) /* Virtual Machine Extensions */
#define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */
/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
--- /dev/null
+/*
+ * structures and definitions for the int 15, ax=e820 memory map
+ * scheme.
+ *
+ * In a nutshell, arch/i386/boot/setup.S populates a scratch table
+ * in the empty_zero_block that contains a list of usable address/size
+ * duples. In arch/i386/kernel/setup.c, this information is
+ * transferred into the e820map, and in arch/i386/mm/init.c, that
+ * new information is used to mark pages reserved or not.
+ *
+ */
+#ifndef __E820_HEADER
+#define __E820_HEADER
+
+#define E820MAP 0x2d0 /* our map */
+#define E820MAX 32 /* number of entries in E820MAP */
+#define E820NR 0x1e8 /* # entries in E820MAP */
+
+/* BIOS-defined E820 memory-range types (1-4). */
+#define E820_RAM 1
+#define E820_RESERVED 2
+#define E820_ACPI 3 /* usable as RAM once ACPI tables have been read */
+#define E820_NVS 4
+/* NOTE(review): E820_IO and E820_SHARED_PAGE appear to be Xen/VMX-specific
+ extensions rather than BIOS-defined types -- confirm against callers. */
+#define E820_IO 16
+#define E820_SHARED_PAGE 17
+
+#define HIGH_MEMORY (1024*1024)
+
+#ifndef __ASSEMBLY__
+
+/* In-kernel copy of the firmware memory map. */
+struct e820map {
+ int nr_map;
+ struct e820entry {
+ unsigned long long addr; /* start of memory segment */
+ unsigned long long size; /* size of memory segment */
+ unsigned long type; /* type of memory segment */
+ } map[E820MAX];
+};
+
+extern struct e820map e820;
+#endif/*!__ASSEMBLY__*/
+
+#endif/*__E820_HEADER*/
* contiguous (or near contiguous) physical memory.
*/
#undef machine_to_phys_mapping
+/*
+ * The phys_to_machine_mapping is the reversed mapping of MPT for full
+ * virtualization.
+ */
+#undef phys_to_machine_mapping
+
#ifdef __x86_64__
extern unsigned long *machine_to_phys_mapping;
#else
#define machine_to_phys_mapping ((unsigned long *)RDWR_MPT_VIRT_START)
+#ifdef CONFIG_VMX
+#define phys_to_machine_mapping ((unsigned long *)PERDOMAIN_VIRT_START)
+#endif
#endif
#define DEFAULT_GDT_ENTRIES (LAST_RESERVED_GDT_ENTRY+1)
#define MSR_MTRRcap 0x0fe
#define MSR_IA32_BBL_CR_CTL 0x119
+#define MSR_IA32_SYSENTER_CS 0x174
+#define MSR_IA32_SYSENTER_ESP 0x175
+#define MSR_IA32_SYSENTER_EIP 0x176
+
#define MSR_IA32_MCG_CAP 0x179
#define MSR_IA32_MCG_STATUS 0x17a
#define MSR_IA32_MCG_CTL 0x17b
#include <asm/pdb.h>
#include <xen/config.h>
#include <xen/spinlock.h>
+#include <asm/vmx_vmcs.h>
#include <public/xen.h>
#endif
#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
+#define X86_CR4_VMXE 0x2000 /* enable VMX */
/*
* Trap/fault mnemonics.
struct desc_struct fast_trap_desc;
#endif
trap_info_t traps[256];
+#ifdef CONFIG_VMX
+ struct arch_vmx_struct arch_vmx; /* Virtual Machine Extensions */
+#endif
};
#define IDT_ENTRIES 256
l1_pgentry_t *perdomain_ptes;
pagetable_t pagetable;
+#ifdef CONFIG_VMX
+
+#define SHM_full_32 (8) /* full virtualization for 32-bit */
+
+ pagetable_t monitor_table;
+ l2_pgentry_t *vpagetable; /* virtual address of pagetable */
+ l2_pgentry_t *shadow_vtable; /* virtual address of shadow_table */
+ l2_pgentry_t *guest_pl2e_cache; /* guest page directory cache */
+ unsigned long min_pfn; /* min host physical */
+ unsigned long max_pfn; /* max host physical */
+#endif
+
/* shadow mode status and controls */
unsigned int shadow_mode; /* flags to control shadow table operation */
pagetable_t shadow_table;
char gdt[10]; /* NB. 10 bytes needed for x86_64. Use 6 bytes for x86_32. */
};
+#define SHM_full_32 (8) /* full virtualization for 32-bit */
+
static inline void write_ptbase(struct mm_struct *mm)
{
 unsigned long pa;
+#ifdef CONFIG_VMX
+ /*
+ * A full-virtualization (SHM_full_32) domain runs on its monitor
+ * table; other shadowed domains run on their shadow table.
+ */
+ if ( unlikely(mm->shadow_mode) ) {
+ if (mm->shadow_mode == SHM_full_32)
+ pa = pagetable_val(mm->monitor_table);
+ else
+ pa = pagetable_val(mm->shadow_table);
+ }
+#else
 if ( unlikely(mm->shadow_mode) )
- pa = pagetable_val(mm->shadow_table);
+ pa = pagetable_val(mm->shadow_table);
+#endif
 else
- pa = pagetable_val(mm->pagetable);
+ pa = pagetable_val(mm->pagetable);
 write_cr3(pa);
 }
long set_debugreg(struct exec_domain *p, int reg, unsigned long value);
+/*
+ * Intel microcode update image layout. The fixed-size header is split
+ * out so the variable-length data that follows can be addressed via a
+ * trailing zero-length array (GNU pre-C99 flexible-array idiom).
+ */
+struct microcode_header {
+ unsigned int hdrver;
+ unsigned int rev;
+ unsigned int date;
+ unsigned int sig;
+ unsigned int cksum;
+ unsigned int ldrver;
+ unsigned int pf;
+ unsigned int datasize;
+ unsigned int totalsize;
+ unsigned int reserved[3];
+};
+
 struct microcode {
- unsigned int hdrver;
- unsigned int rev;
- unsigned int date;
- unsigned int sig;
- unsigned int cksum;
- unsigned int ldrver;
- unsigned int pf;
- unsigned int reserved[5];
- unsigned int bits[500];
+ struct microcode_header hdr;
+ unsigned int bits[0]; /* variable-length update payload */
 };
+typedef struct microcode microcode_t;
+typedef struct microcode_header microcode_header_t;
+
+/* microcode format is extended from prescott processors */
+struct extended_signature {
+ unsigned int sig;
+ unsigned int pf;
+ unsigned int cksum;
+};
+
+struct extended_sigtable {
+ unsigned int count;
+ unsigned int cksum;
+ unsigned int reserved[3];
+ struct extended_signature sigs[0]; /* 'count' entries follow */
+};
/* '6' because it used to be for P6 only (but now covers Pentium 4 as well) */
#define MICROCODE_IOCFREE _IO('6',0)
#define SHM_logdirty (2) /* log pages that are dirtied */
#define SHM_translate (3) /* lookup machine pages in translation table */
#define SHM_cow (4) /* copy on write all dirtied pages */
+#define SHM_full_32 (8) /* full virtualization for 32-bit */
#define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
#define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \
extern void unshadow_table(unsigned long gpfn, unsigned int type);
extern int shadow_mode_enable(struct domain *p, unsigned int mode);
+#ifdef CONFIG_VMX
+extern void vmx_shadow_clear_state(struct mm_struct *);
+extern void vmx_shadow_invlpg(struct mm_struct *, unsigned long);
+#endif
+
+/*
+ * Translate a machine frame number to the guest-physical frame number
+ * (and the reverse, below). For full-virtualization (SHM_full_32)
+ * domains the translation arrays are consulted; otherwise the two
+ * address spaces are identical.
+ *
+ * NOTE(review): these multi-statement if/else macros are not wrapped
+ * in do { } while (0) and will mis-parse if used unbraced under an
+ * if/else at a call site. Left as-is here because the expansion ends
+ * with a semicolon, so existing callers may invoke them without one;
+ * converting would need an audit of every call site.
+ */
+#define __get_machine_to_phys(m, guest_gpfn, gpfn) \
+ if ((m)->shadow_mode == SHM_full_32) \
+ (guest_gpfn) = machine_to_phys_mapping[(gpfn)]; \
+ else \
+ (guest_gpfn) = (gpfn);
+
+#define __get_phys_to_machine(m, host_gpfn, gpfn) \
+ if ((m)->shadow_mode == SHM_full_32) \
+ (host_gpfn) = phys_to_machine_mapping[(gpfn)]; \
+ else \
+ (host_gpfn) = (gpfn);
+
extern void __shadow_mode_disable(struct domain *d);
static inline void shadow_mode_disable(struct domain *d)
{
extern unsigned long shadow_l2_table(
struct mm_struct *m, unsigned long gpfn);
+
+/* Wipe the (already-mapped) shadow L2 table of a full-virtualization
+ * domain, forcing every shadow entry to be rebuilt on demand. Only
+ * valid in SHM_full_32 mode, where shadow_vtable is kept mapped. */
+static inline void shadow_invalidate(struct mm_struct *m) {
+ if (m->shadow_mode != SHM_full_32)
+ BUG();
+ memset(m->shadow_vtable, 0, PAGE_SIZE);
+}
-#define SHADOW_DEBUG 0
+#define SHADOW_DEBUG 0
#define SHADOW_HASH_DEBUG 0
struct shadow_status {
printk("DOM%u: (file=shadow.c, line=%d) " _f "\n", \
current->id , __LINE__ , ## _a )
#else
-#define SH_VVLOG(_f, _a...)
+#define SH_VVLOG(_f, _a...)
#endif
+/* Read the shadow L2 entry covering va. SHM_full_32 domains use the
+ * permanently mapped shadow_vtable; others go via the linear mapping. */
+static inline void __shadow_get_pl2e(struct mm_struct *m,
+ unsigned long va, unsigned long *sl2e)
+{
+ if (m->shadow_mode == SHM_full_32) {
+ *sl2e = l2_pgentry_val(m->shadow_vtable[va >> L2_PAGETABLE_SHIFT]);
+ }
+ else
+ *sl2e = l2_pgentry_val(linear_l2_table[va >> L2_PAGETABLE_SHIFT]);
+}
+
+/* Write the shadow L2 entry covering va (counterpart of the above). */
+static inline void __shadow_set_pl2e(struct mm_struct *m,
+ unsigned long va, unsigned long value)
+{
+ if (m->shadow_mode == SHM_full_32) {
+ m->shadow_vtable[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(value);
+ }
+ else
+ linear_l2_table[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(value);
+}
+
+/* Read the guest's own L2 entry covering va (vpagetable holds the
+ * guest page directory for SHM_full_32 domains). */
+static inline void __guest_get_pl2e(struct mm_struct *m,
+ unsigned long va, unsigned long *l2e)
+{
+ if (m->shadow_mode == SHM_full_32) {
+ *l2e = l2_pgentry_val(m->vpagetable[va >> L2_PAGETABLE_SHIFT]);
+ }
+ else
+ *l2e = l2_pgentry_val(linear_l2_table[va >> L2_PAGETABLE_SHIFT]);
+}
+
+/* Write the guest's L2 entry covering va. For SHM_full_32 the entry's
+ * guest-physical frame is first translated to a machine frame and the
+ * translation cached in guest_pl2e_cache. */
+static inline void __guest_set_pl2e(struct mm_struct *m,
+ unsigned long va, unsigned long value)
+{
+ if (m->shadow_mode == SHM_full_32) {
+ unsigned long pfn;
+
+ pfn = phys_to_machine_mapping[value >> PAGE_SHIFT];
+ m->guest_pl2e_cache[va >> L2_PAGETABLE_SHIFT] =
+ mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+
+ m->vpagetable[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(value);
+ }
+ else
+ linear_l2_table[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(value);
+
+}
/************************************************************************/
unsigned long spte = *spte_p;
ASSERT(gpte & _PAGE_RW);
-
gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
switch ( m->shadow_mode )
case SHM_logdirty:
spte = gpte | _PAGE_RW;
__mark_dirty(m, gpte >> PAGE_SHIFT);
+
+ case SHM_full_32:
+ {
+ unsigned long host_pfn, host_gpte;
+
+ host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT];
+ host_gpte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
+ spte = host_gpte | _PAGE_RW;
+ }
break;
}
+ SH_VVLOG("updating spte=%lx gpte=%lx", spte, gpte);
*gpte_p = gpte;
*spte_p = spte;
}
case SHM_logdirty:
spte = gpte & ~_PAGE_RW;
break;
+
+ case SHM_full_32:
+ {
+ unsigned long host_pfn, host_gpte;
+
+ host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT];
+ host_gpte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
+ spte = (host_gpte & _PAGE_DIRTY) ? host_gpte : (host_gpte & ~_PAGE_RW);
+ }
+ break;
+
}
*gpte_p = gpte;
(_PAGE_PRESENT|_PAGE_ACCESSED) )
spte = gpte & ~_PAGE_RW;
break;
+
+ case SHM_full_32:
+ {
+ unsigned long host_pfn, host_gpte;
+
+ host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT];
+ host_gpte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
+ spte = 0;
+
+ if ( (host_gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) ==
+ (_PAGE_PRESENT|_PAGE_ACCESSED) )
+ spte = (host_gpte & _PAGE_DIRTY) ? host_gpte : (host_gpte & ~_PAGE_RW);
+ }
+ break;
}
*gpte_p = gpte;
/* Detect linear p.t. mappings and write-protect them. */
if ( (frame_table[sl1pfn].u.inuse.type_info & PGT_type_mask) ==
- PGT_l2_page_table )
- spde = gpde & ~_PAGE_RW;
+ PGT_l2_page_table )
+ {
+ if (m->shadow_mode != SHM_full_32)
+ spde = gpde & ~_PAGE_RW;
+
+ }
}
*gpde_p = gpde;
head = hash_bucket(m, gpfn);
- SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, b);
+ SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, head);
shadow_audit(m, 0);
/* Match on head item? */
x = head = hash_bucket(m, gpfn);
- SH_VVLOG("set gpfn=%08x s=%08lx bucket=%p(%p)", gpfn, s, b, b->next);
+ SH_VVLOG("set gpfn=%08x s=%08lx bucket=%p(%p)", gpfn, s, x, x->next);
shadow_audit(m, 0);
/*
done:
shadow_audit(m, 0);
}
+
+#ifdef CONFIG_VMX
+#include <asm/domain_page.h>
+
+/*
+ * Switch a full-virtualization domain to a new (guest, shadow) L2 pair:
+ * tear down the previous shadow_vtable/vpagetable mappings, point the
+ * monitor table's shadow-linear slot at the new shadow L2, and cache
+ * fresh mappings of both tables in the mm. The new shadow L2 is zeroed
+ * so entries fault in on demand. Note the spl2e/gpl2e mappings are
+ * deliberately left mapped; they are released on the next call here.
+ */
+static inline void vmx_update_shadow_state(
+ struct mm_struct *mm, unsigned long gpfn, unsigned long spfn)
+{
+
+ l2_pgentry_t *mpl2e = 0;
+ l2_pgentry_t *gpl2e, *spl2e;
+
+ /* unmap the old mappings */
+ if (mm->shadow_vtable)
+ unmap_domain_mem(mm->shadow_vtable);
+ if (mm->vpagetable)
+ unmap_domain_mem(mm->vpagetable);
+
+ /* new mapping */
+ mpl2e = (l2_pgentry_t *)
+ map_domain_mem(pagetable_val(mm->monitor_table));
+
+ mpl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
+ mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+ __flush_tlb_one(SH_LINEAR_PT_VIRT_START);
+
+ spl2e = (l2_pgentry_t *) map_domain_mem(spfn << PAGE_SHIFT);
+ gpl2e = (l2_pgentry_t *) map_domain_mem(gpfn << PAGE_SHIFT);
+ memset(spl2e, 0, ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+ mm->shadow_table = mk_pagetable(spfn<<PAGE_SHIFT);
+ mm->shadow_vtable = spl2e;
+ mm->vpagetable = gpl2e; /* expect the guest did clean this up */
+ unmap_domain_mem(mpl2e);
+}
+
+/*
+ * (Re)derive mm->shadow_table from mm->pagetable. For SHM_full_32
+ * domains, a pre-existing shadow is re-attached via
+ * vmx_update_shadow_state; a missing one is built by shadow_l2_table.
+ * NOTE(review): in the SHM_full_32 spfn==0 path the vtable mappings are
+ * not refreshed here -- presumably shadow_l2_table does that; confirm.
+ */
+static inline void __shadow_mk_pagetable( struct mm_struct *mm )
+{
+ unsigned long gpfn = pagetable_val(mm->pagetable) >> PAGE_SHIFT;
+ unsigned long spfn;
+ SH_VLOG("0: __shadow_mk_pagetable(gpfn=%08lx\n", gpfn);
+
+ if (mm->shadow_mode == SHM_full_32)
+ {
+ /* guest_gpfn is computed only for the debug log below. */
+ unsigned long guest_gpfn;
+ guest_gpfn = machine_to_phys_mapping[gpfn];
+
+ SH_VVLOG("__shadow_mk_pagetable(guest_gpfn=%08lx, gpfn=%08lx\n",
+ guest_gpfn, gpfn);
+
+ spfn = __shadow_status(mm, gpfn) & PSH_pfn_mask;
+ if ( unlikely(spfn == 0) ) {
+ spfn = shadow_l2_table(mm, gpfn);
+ mm->shadow_table = mk_pagetable(spfn<<PAGE_SHIFT);
+ } else {
+ vmx_update_shadow_state(mm, gpfn, spfn);
+ }
+ } else {
+ spfn = __shadow_status(mm, gpfn) & PSH_pfn_mask;
+
+ if ( unlikely(spfn == 0) ) {
+ spfn = shadow_l2_table(mm, gpfn);
+ }
+ mm->shadow_table = mk_pagetable(spfn<<PAGE_SHIFT);
+ }
+}
+#else
static inline void __shadow_mk_pagetable(struct mm_struct *mm)
{
unsigned long gpfn = pagetable_val(mm->pagetable) >> PAGE_SHIFT;
mm->shadow_table = mk_pagetable(spfn << PAGE_SHIFT);
}
+#endif /* CONFIG_VMX */
 static inline void shadow_mk_pagetable(struct mm_struct *mm)
 {
- SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )",
- pagetable_val(mm->pagetable), mm->shadow_mode );
-
- if ( unlikely(mm->shadow_mode) )
- {
- shadow_lock(mm);
- __shadow_mk_pagetable(mm);
- shadow_unlock(mm);
- }
-
- SH_VVLOG("leaving shadow_mk_pagetable( gptbase=%08lx, mode=%d ) sh=%08lx",
- pagetable_val(mm->pagetable), mm->shadow_mode,
- pagetable_val(mm->shadow_table) );
+ /* Only shadowed domains need a shadow rebuild; take the per-mm
+ * shadow lock around it. Logging now happens only on that path. */
+ if ( unlikely(mm->shadow_mode) )
+ {
+ SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )",
+ pagetable_val(mm->pagetable), mm->shadow_mode );
+
+ shadow_lock(mm);
+ __shadow_mk_pagetable(mm);
+ shadow_unlock(mm);
+
+ SH_VVLOG("leaving shadow_mk_pagetable:\n");
+
+ SH_VVLOG("( gptbase=%08lx, mode=%d ) sh=%08lx",
+ pagetable_val(mm->pagetable), mm->shadow_mode,
+ pagetable_val(mm->shadow_table) );
+
+ }
 }
#if SHADOW_DEBUG
--- /dev/null
+/*
+ * vmx.h: VMX Architecture related definitions
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#ifndef __ASM_X86_VMX_H__
+#define __ASM_X86_VMX_H__
+
+#include <xen/sched.h>
+#include <asm/types.h>
+#include <asm/regs.h>
+#include <asm/processor.h>
+#include <asm/vmx_vmcs.h>
+
+extern void vmx_asm_vmexit_handler(struct xen_regs);
+extern void vmx_asm_do_resume(void);
+extern void vmx_asm_do_launch(void);
+extern void vmx_intr_assist(struct exec_domain *d);
+
+extern void arch_vmx_do_launch(struct exec_domain *);
+extern void arch_vmx_do_resume(struct exec_domain *);
+
+extern int vmcs_size;
+extern unsigned int cpu_rev;
+
+/*
+ * Need fill bits for SENTER
+ */
+
+#define MONITOR_PIN_BASED_EXEC_CONTROLS 0x0000001f
+#define MONITOR_CPU_BASED_EXEC_CONTROLS 0x0581e7f2
+#define MONITOR_VM_EXIT_CONTROLS 0x0003edff
+#define MONITOR_VM_ENTRY_CONTROLS 0x000011ff
+
+/*
+ * Exit Reasons
+ */
+#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
+
+#define EXIT_REASON_EXCEPTION_NMI 0
+#define EXIT_REASON_EXTERNAL_INTERRUPT 1
+
+#define EXIT_REASON_PENDING_INTERRUPT 7
+
+#define EXIT_REASON_TASK_SWITCH 9
+#define EXIT_REASON_CPUID 10
+#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVLPG 14
+#define EXIT_REASON_RDPMC 15
+#define EXIT_REASON_RDTSC 16
+#define EXIT_REASON_VMCALL 18
+
+#define EXIT_REASON_CR_ACCESS 28
+#define EXIT_REASON_DR_ACCESS 29
+#define EXIT_REASON_IO_INSTRUCTION 30
+#define EXIT_REASON_MSR_READ 31
+#define EXIT_REASON_MSR_WRITE 32
+#define EXIT_REASON_MWAIT_INSTRUCTION 36
+
+/*
+ * Interruption-information format
+ */
+#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
+#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
+#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */
+#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
+
+#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
+#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
+
+/*
+ * Exit Qualifications for MOV for Control Register Access
+ */
+#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */
+#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */
+#define TYPE_MOV_TO_CR (0 << 4)
+#define TYPE_MOV_FROM_CR (1 << 4)
+#define TYPE_CLTS (2 << 4)
+#define CONTROL_REG_ACCESS_REG 0x700 /* 10:8, general purpose register */
+#define REG_EAX (0 << 8)
+#define REG_ECX (1 << 8)
+#define REG_EDX (2 << 8)
+#define REG_EBX (3 << 8)
+#define REG_ESP (4 << 8)
+#define REG_EBP (5 << 8)
+#define REG_ESI (6 << 8)
+#define REG_EDI (7 << 8)
+
+/*
+ * Exit Qualifications for MOV for Debug Register Access
+ */
+#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */
+#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
+#define TYPE_MOV_TO_DR (0 << 4)
+#define TYPE_MOV_FROM_DR (1 << 4)
+#define DEBUG_REG_ACCESS_REG 0x700 /* 11:8, general purpose register */
+
+#define EXCEPTION_BITMAP_DE (1 << 0) /* Divide Error */
+#define EXCEPTION_BITMAP_DB (1 << 1) /* Debug */
+#define EXCEPTION_BITMAP_NMI (1 << 2) /* NMI */
+#define EXCEPTION_BITMAP_BP (1 << 3) /* Breakpoint */
+#define EXCEPTION_BITMAP_OF (1 << 4) /* Overflow */
+#define EXCEPTION_BITMAP_BR (1 << 5) /* BOUND Range Exceeded */
+#define EXCEPTION_BITMAP_UD (1 << 6) /* Invalid Opcode */
+#define EXCEPTION_BITMAP_NM (1 << 7) /* Device Not Available */
+#define EXCEPTION_BITMAP_DF (1 << 8) /* Double Fault */
+/* reserved */
+#define EXCEPTION_BITMAP_TS (1 << 10) /* Invalid TSS */
+#define EXCEPTION_BITMAP_NP (1 << 11) /* Segment Not Present */
+#define EXCEPTION_BITMAP_SS (1 << 12) /* Stack-Segment Fault */
+#define EXCEPTION_BITMAP_GP (1 << 13) /* General Protection */
+#define EXCEPTION_BITMAP_PG (1 << 14) /* Page Fault */
+#define EXCEPTION_BITMAP_MF (1 << 16) /* x87 FPU Floating-Point Error (Math Fault) */
+#define EXCEPTION_BITMAP_AC (1 << 17) /* Alignment Check */
+#define EXCEPTION_BITMAP_MC (1 << 18) /* Machine Check */
+#define EXCEPTION_BITMAP_XF (1 << 19) /* SIMD Floating-Point Exception */
+
+#ifdef XEN_DEBUGGER
+#define MONITOR_DEFAULT_EXCEPTION_BITMAP \
+ ( EXCEPTION_BITMAP_PG | \
+ EXCEPTION_BITMAP_DB | \
+ EXCEPTION_BITMAP_BP | \
+ EXCEPTION_BITMAP_GP )
+#else
+#define MONITOR_DEFAULT_EXCEPTION_BITMAP \
+ ( EXCEPTION_BITMAP_PG | \
+ EXCEPTION_BITMAP_GP )
+#endif
+
+#define VMCALL_OPCODE ".byte 0x0f,0x01,0xc1\n"
+#define VMCLEAR_OPCODE ".byte 0x66,0x0f,0xc7\n" /* reg/opcode: /6 */
+#define VMLAUNCH_OPCODE ".byte 0x0f,0x01,0xc2\n"
+#define VMPTRLD_OPCODE ".byte 0x0f,0xc7\n" /* reg/opcode: /6 */
+#define VMPTRST_OPCODE ".byte 0x0f,0xc7\n" /* reg/opcode: /7 */
+#define VMREAD_OPCODE ".byte 0x0f,0x78\n"
+#define VMRESUME_OPCODE ".byte 0x0f,0x01,0xc3\n"
+#define VMWRITE_OPCODE ".byte 0x0f,0x79\n"
+#define VMXOFF_OPCODE ".byte 0x0f,0x01,0xc4\n"
+#define VMXON_OPCODE ".byte 0xf3,0x0f,0xc7\n"
+
+#define MODRM_EAX_06 ".byte 0x30\n" /* [EAX], with reg/opcode: /6 */
+#define MODRM_EAX_07 ".byte 0x38\n" /* [EAX], with reg/opcode: /7 */
+#define MODRM_EAX_ECX ".byte 0xc1\n" /* [EAX], [ECX] */
+
+/*
+ * Inline wrappers for the VMX instructions, emitted as raw opcode bytes
+ * (the assembler predates the VMX mnemonics; MODRM_EAX_* supply the
+ * ModRM byte). Per the instruction set, failure is reported through
+ * EFLAGS: CF set (VMfailInvalid) or ZF set (VMfailValid); each wrapper
+ * returns -1 on failure and 0 on success.
+ *
+ * NOTE(review): the flags are sampled with __save_flags() *after* the
+ * asm statement, relying on the compiler not to insert flag-modifying
+ * code in between (the asms declare no flag outputs) -- fragile but
+ * apparently functional with the contemporary toolchain.
+ */
+static inline int __vmptrld (u64 addr)
+{
+ unsigned long eflags;
+ __asm__ __volatile__ ( VMPTRLD_OPCODE
+ MODRM_EAX_06
+ :
+ : "a" (&addr)
+ : "memory");
+
+ __save_flags(eflags);
+ if (eflags & X86_EFLAGS_ZF || eflags & X86_EFLAGS_CF)
+ return -1;
+ return 0;
+}
+
+/* Store the current-VMCS pointer into addr. */
+static inline void __vmptrst (u64 addr)
+{
+ __asm__ __volatile__ ( VMPTRST_OPCODE
+ MODRM_EAX_07
+ :
+ : "a" (&addr)
+ : "memory");
+}
+
+/* VMCLEAR the VMCS at the given physical address. */
+static inline int __vmpclear (u64 addr)
+{
+ unsigned long eflags;
+
+ __asm__ __volatile__ ( VMCLEAR_OPCODE
+ MODRM_EAX_06
+ :
+ : "a" (&addr)
+ : "memory");
+ __save_flags(eflags);
+ if (eflags & X86_EFLAGS_ZF || eflags & X86_EFLAGS_CF)
+ return -1;
+ return 0;
+}
+
+/* Read a VMCS field. NOTE(review): writes a full long through 'value',
+ * so the caller must supply at least long-sized storage. */
+static inline int __vmread (unsigned int field, void *value)
+{
+ unsigned long eflags;
+ unsigned long ecx = 0;
+
+ __asm__ __volatile__ ( VMREAD_OPCODE
+ MODRM_EAX_ECX
+ : "=c" (ecx)
+ : "a" (field)
+ : "memory");
+
+ *((long *) value) = ecx;
+
+ __save_flags(eflags);
+ if (eflags & X86_EFLAGS_ZF || eflags & X86_EFLAGS_CF)
+ return -1;
+ return 0;
+}
+
+/* Write a VMCS field. */
+static inline int __vmwrite (unsigned int field, unsigned int value)
+{
+ unsigned long eflags;
+
+ __asm__ __volatile__ ( VMWRITE_OPCODE
+ MODRM_EAX_ECX
+ :
+ : "a" (field) , "c" (value)
+ : "memory");
+ __save_flags(eflags);
+ if (eflags & X86_EFLAGS_ZF || eflags & X86_EFLAGS_CF)
+ return -1;
+ return 0;
+}
+
+/* Leave VMX operation on this CPU. */
+static inline void __vmxoff (void)
+{
+ __asm__ __volatile__ ( VMXOFF_OPCODE
+ ::: "memory");
+}
+
+/* Enter VMX operation using the VMXON region at addr. */
+static inline int __vmxon (u64 addr)
+{
+ unsigned long eflags;
+
+ __asm__ __volatile__ ( VMXON_OPCODE
+ MODRM_EAX_06
+ :
+ : "a" (&addr)
+ : "memory");
+ __save_flags(eflags);
+ if (eflags & X86_EFLAGS_ZF || eflags & X86_EFLAGS_CF)
+ return -1;
+ return 0;
+}
+#endif /* __ASM_X86_VMX_H__ */
--- /dev/null
+/*
+ * vmx_cpu.h: Virtual CPU state
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+/*
+ * Guard fixed: this header previously reused __ASM_X86_VMX_VMCS_H__,
+ * colliding with vmx_vmcs.h -- whichever header was included second
+ * would be silently skipped.
+ */
+#ifndef __ASM_X86_VMX_CPU_H__
+#define __ASM_X86_VMX_CPU_H__
+
+/*
+ * Virtual CPU
+ */
+struct arch_state_struct {
+ unsigned long mode_flags; /* vm86, 32-bit, 64-bit, etc. (VMX_MF_*) */
+ /* debug registers */
+ /* MSRs */
+};
+
+/* Bit indices for mode_flags. */
+#define VMX_MF_VM86 0
+#define VMX_MF_32 1
+#define VMX_MF_64 2
+
+#endif /* __ASM_X86_VMX_CPU_H__ */
--- /dev/null
+/*
+ * vmx_platform.h: VMX platform support
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#ifndef __ASM_X86_VMX_PLATFORM_H__
+#define __ASM_X86_VMX_PLATFORM_H__
+
+#include <asm/e820.h> /* from Linux */
+
+#endif
--- /dev/null
+/*
+ * vmx_vmcs.h: VMCS related definitions
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#ifndef __ASM_X86_VMX_VMCS_H__
+#define __ASM_X86_VMX_VMCS_H__
+
+#include <asm/config.h>
+#include <asm/vmx_cpu.h>
+#include <asm/vmx_platform.h>
+
+extern int start_vmx(void);
+extern void stop_vmx(void);
+
+void vmx_enter_scheduler(void);
+
+/*
+ * Segment access-rights byte as held in the VMCS GUEST_*_AR_BYTES
+ * fields: addressable either as a raw 32-bit value or field-by-field.
+ */
+union vmcs_arbytes {
+ struct arbyte_fields {
+ unsigned int
+ seg_type: 4, s: 1, dpl: 2, p: 1,
+ reserved0: 4, avl: 1, reserved1: 1,
+ default_ops_size: 1, g: 1, null_bit: 1,
+ reserved2: 15;
+ } __attribute__((packed)) fields;
+ unsigned int bytes;
+};
+
+/* Per-domain virtual-platform state. NOTE(review): the tag "virutal"
+ * is a typo, but it is the declared identifier referenced elsewhere
+ * (e.g. arch_vmx_struct), so it cannot be renamed in isolation. */
+struct virutal_platform_def {
+ unsigned long *real_mode_data; /* E820, etc. */
+ unsigned long shared_page_va;
+};
+
+int vmx_setup_platform(struct exec_domain *, execution_context_t *);
+
+#define VMX_CPU_STATE_PG_ENABLED 0
+
+#define VMCS_SIZE 0x1000
+
+/* A VMCS region: revision identifier followed by opaque hardware data,
+ * padded to one 4KB page (see VMCS_SIZE). */
+struct vmcs_struct {
+ u32 vmcs_revision_id;
+ unsigned char data [0x1000 - sizeof (u32)];
+};
+
+/* Per-exec-domain VMX state, embedded in the thread structure. */
+struct arch_vmx_struct {
+ struct vmcs_struct *vmcs; /* VMCS pointer in virtual */
+ unsigned long flags; /* VMCS flags (ARCH_VMX_* bit indices) */
+ unsigned long cpu_cr2; /* save CR2 */
+ unsigned long cpu_cr3;
+ unsigned long cpu_state; /* VMX_CPU_STATE_* bits */
+ struct virutal_platform_def vmx_platform;
+#if 0
+ /* open */
+ unsigned long *page_list; /* page list for MMIO */
+#endif
+};
+
+#define vmx_schedule_tail(next) \
+ (next)->thread.arch_vmx.arch_vmx_schedule_tail((next))
+
+#define VMX_DOMAIN(d) d->thread.arch_vmx.flags
+
+#define ARCH_VMX_VMCS_LOADED 0 /* VMCS has been loaded and active */
+#define ARCH_VMX_VMCS_LAUNCH 1 /* Needs VMCS launch */
+#define ARCH_VMX_VMCS_RESUME 2 /* Needs VMCS resume */
+#define ARCH_VMX_IO_WAIT 3 /* Waiting for I/O completion */
+
+void vmx_do_launch(struct exec_domain *);
+void vmx_do_resume(struct exec_domain *);
+
+struct vmcs_struct *alloc_vmcs(void);
+void free_vmcs(struct vmcs_struct *);
+int load_vmcs(struct arch_vmx_struct *, u64);
+int store_vmcs(struct arch_vmx_struct *, u64);
+void dump_vmcs(void);
+int construct_vmcs(struct arch_vmx_struct *, execution_context_t *,
+ full_execution_context_t *, int);
+
+#define VMCS_USE_HOST_ENV 1
+#define VMCS_USE_SEPARATE_ENV 0
+
+#define VMCS_EFLAGS_RESERVED_0 0xffc08028 /* bitmap for 0 */
+#define VMCS_EFLAGS_RESERVED_1 0x00000002 /* bitmap for 1 */
+
+extern int vmcs_version;
+
+/* VMCS Encodings */
+enum vmcs_field {
+ GUEST_ES_SELECTOR = 0x00000800,
+ GUEST_CS_SELECTOR = 0x00000802,
+ GUEST_SS_SELECTOR = 0x00000804,
+ GUEST_DS_SELECTOR = 0x00000806,
+ GUEST_FS_SELECTOR = 0x00000808,
+ GUEST_GS_SELECTOR = 0x0000080a,
+ GUEST_LDTR_SELECTOR = 0x0000080c,
+ GUEST_TR_SELECTOR = 0x0000080e,
+ HOST_ES_SELECTOR = 0x00000c00,
+ HOST_CS_SELECTOR = 0x00000c02,
+ HOST_SS_SELECTOR = 0x00000c04,
+ HOST_DS_SELECTOR = 0x00000c06,
+ HOST_FS_SELECTOR = 0x00000c08,
+ HOST_GS_SELECTOR = 0x00000c0a,
+ HOST_TR_SELECTOR = 0x00000c0c,
+ IO_BITMAP_A = 0x00002000,
+ IO_BITMAP_B = 0x00002002,
+ VM_EXIT_MSR_STORE_ADDR = 0x00002006,
+ VM_EXIT_MSR_LOAD_ADDR = 0x00002008,
+ VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
+ TSC_OFFSET = 0x00002010,
+ GUEST_VMCS0 = 0x00002800,
+ GUEST_VMCS1 = 0x00002801,
+ GUEST_IA32_DEBUGCTL = 0x00002802,
+ PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
+ CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
+ EXCEPTION_BITMAP = 0x00004004,
+ PAGE_FAULT_ERROR_CODE_MASK = 0x00004006,
+ PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008,
+ CR3_TARGET_COUNT = 0x0000400a,
+ VM_EXIT_CONTROLS = 0x0000400c,
+ VM_EXIT_MSR_STORE_COUNT = 0x0000400e,
+ VM_EXIT_MSR_LOAD_COUNT = 0x00004010,
+ VM_ENTRY_CONTROLS = 0x00004012,
+ VM_ENTRY_MSR_LOAD_COUNT = 0x00004014,
+ VM_ENTRY_INTR_INFO_FIELD = 0x00004016,
+ VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018,
+ VM_EXIT_REASON = 0x00004402,
+ VM_EXIT_INTR_INFO = 0x00004404,
+ VM_EXIT_INTR_ERROR_CODE = 0x00004406,
+ IDT_VECTORING_INFO_FIELD = 0x00004408,
+ IDT_VECTORING_ERROR_CODE = 0x0000440a,
+ INSTRUCTION_LEN = 0x0000440c,
+ GUEST_ES_LIMIT = 0x00004800,
+ GUEST_CS_LIMIT = 0x00004802,
+ GUEST_SS_LIMIT = 0x00004804,
+ GUEST_DS_LIMIT = 0x00004806,
+ GUEST_FS_LIMIT = 0x00004808,
+ GUEST_GS_LIMIT = 0x0000480a,
+ GUEST_LDTR_LIMIT = 0x0000480c,
+ GUEST_TR_LIMIT = 0x0000480e,
+ GUEST_GDTR_LIMIT = 0x00004810,
+ GUEST_IDTR_LIMIT = 0x00004812,
+ GUEST_ES_AR_BYTES = 0x00004814,
+ GUEST_CS_AR_BYTES = 0x00004816,
+ GUEST_SS_AR_BYTES = 0x00004818,
+ GUEST_DS_AR_BYTES = 0x0000481a,
+ GUEST_FS_AR_BYTES = 0x0000481c,
+ GUEST_GS_AR_BYTES = 0x0000481e,
+ GUEST_LDTR_AR_BYTES = 0x00004820,
+ GUEST_TR_AR_BYTES = 0x00004822,
+ GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
+ CR0_GUEST_HOST_MASK = 0x00006000,
+ CR4_GUEST_HOST_MASK = 0x00006002,
+ CR0_READ_SHADOW = 0x00006004,
+ CR4_READ_SHADOW = 0x00006006,
+ CR3_TARGET_VALUES = 0x00006008,
+ CR3_GUEST_HOST_MASK = 0x00006208,
+ EXIT_QUALIFICATION = 0x00006400,
+ GUEST_CR0 = 0x00006800,
+ GUEST_CR3 = 0x00006802,
+ GUEST_CR4 = 0x00006804,
+ GUEST_ES_BASE = 0x00006806,
+ GUEST_CS_BASE = 0x00006808,
+ GUEST_SS_BASE = 0x0000680a,
+ GUEST_DS_BASE = 0x0000680c,
+ GUEST_FS_BASE = 0x0000680e,
+ GUEST_GS_BASE = 0x00006810,
+ GUEST_LDTR_BASE = 0x00006812,
+ GUEST_TR_BASE = 0x00006814,
+ GUEST_GDTR_BASE = 0x00006816,
+ GUEST_IDTR_BASE = 0x00006818,
+ GUEST_DR7 = 0x0000681a,
+ GUEST_ESP = 0x0000681c,
+ GUEST_EIP = 0x0000681e,
+ GUEST_EFLAGS = 0x00006820,
+ GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822,
+ HOST_CR0 = 0x00006c00,
+ HOST_CR3 = 0x00006c02,
+ HOST_CR4 = 0x00006c04,
+ HOST_FS_BASE = 0x00006c06,
+ HOST_GS_BASE = 0x00006c08,
+ HOST_TR_BASE = 0x00006c0a,
+ HOST_GDTR_BASE = 0x00006c0c,
+ HOST_IDTR_BASE = 0x00006c0e,
+ HOST_ESP = 0x00006c14,
+ HOST_EIP = 0x00006c16,
+};
+
+#define VMX_DEBUG 1
+#if VMX_DEBUG
+#define DBG_LEVEL_0 (1 << 0)
+#define DBG_LEVEL_1 (1 << 1)
+#define DBG_LEVEL_2 (1 << 2)
+#define DBG_LEVEL_3 (1 << 3)
+#define DBG_LEVEL_IO (1 << 4)
+#define DBG_LEVEL_VMMU (1 << 5)
+
+extern unsigned int opt_vmx_debug_level;
+#define VMX_DBG_LOG(level, _f, _a...) \
+ if ((level) & opt_vmx_debug_level) \
+ printk("[VMX]" _f "\n", ## _a )
+#else
+#define VMX_DBG_LOG(level, _f, _a...)
+#endif
+
+/* Fatal-error path for unexpected VMX conditions: log the location,
+ * dump the register state, and crash the current domain. */
+#define __vmx_bug(regs) \
+ do { \
+ printk("__vmx_bug at %s:%d\n", __FILE__, __LINE__); \
+ show_registers(regs); \
+ domain_crash(); \
+ } while (0)
+
+#endif /* __ASM_X86_VMX_VMCS_H__ */
*/
typedef struct {
#define ECF_I387_VALID (1<<0)
+#define ECF_VMX_GUEST (2<<0)
unsigned long flags;
execution_context_t cpu_ctxt; /* User-level CPU registers */
char fpu_ctxt[256]; /* User-level FPU registers */
--- /dev/null
+/*
+ * ioreq.h: I/O request definitions for device models
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#ifndef _IOREQ_H_
+#define _IOREQ_H_
+
+#define IOREQ_READ 1
+#define IOREQ_WRITE 0
+
+/* Lifecycle of an I/O request, held in ioreq_t.state. */
+#define STATE_INVALID 0
+#define STATE_IOREQ_READY 1
+#define STATE_IOREQ_INPROCESS 2
+#define STATE_IORESP_READY 3
+
+#define IOPACKET_PORT 2
+
+/* VMExit dispatcher should cooperate with instruction decoder to
+ prepare this structure and notify service OS and DM by sending
+ virq */
+typedef struct {
+ u64 addr; /* physical address */
+ u64 size; /* size in bytes */
+ u64 count; /* for rep prefixes */
+ union {
+ u64 data; /* data */
+ void *pdata; /* pointer to data */
+ } u;
+ u8 state:5; /* STATE_* above */
+ u8 pdata_valid:1; /* if 1, use pdata above */
+ u8 dir:1; /* 1=read, 0=write */
+ u8 port_mm:1; /* 0=portio, 1=mmio */
+} ioreq_t;
+
+#define MAX_VECTOR 256
+#define BITS_PER_BYTE 8
+#define INTR_LEN (MAX_VECTOR/(BITS_PER_BYTE * sizeof(unsigned long)))
+
+/* Per-VCPU page shared with the device model: the current I/O request
+ * plus a MAX_VECTOR-bit pending-interrupt bitmap. */
+typedef struct {
+ ioreq_t vp_ioreq;
+ unsigned long vp_intr[INTR_LEN];
+} vcpu_iodata_t;
+
+#endif /* _IOREQ_H_ */
extern unsigned long volatile jiffies;
extern rwlock_t domlist_lock;
-struct domain;
-
/* A global pointer to the initial domain (DOM0). */
extern struct domain *dom0;
typedef __u64 uint64_t;
+struct domain;
+struct exec_domain;
#endif /* __TYPES_H__ */